raft: internally support joint consensus
This commit introduces machinery to safely apply joint consensus configuration changes to Raft. The main contribution is the new package, `confchange`, which offers the primitives `Simple`, `EnterJoint`, and `LeaveJoint`. The first two take a list of configuration changes. `Simple` only declares success if these configuration changes (applied atomically) change the set of voters by at most one (i.e. it's fine to add or remove any number of learners, but change only one voter). `EnterJoint` makes the configuration joint and then applies the changes to it, in preparation for the caller returning later and transitioning out of the joint config into the final desired configuration via `LeaveJoint()`. This commit streamlines the conversion between voters and learners, which is now generally allowed whenever the above conditions are upheld (i.e. it's not possible to demote a voter and add a new voter in the context of a `Simple` configuration change, but it is possible via `EnterJoint`). Previously, we had the artificial restriction that a voter could not be demoted to a learner, but had to be removed first. Even though demoting a voter is generally less useful than promoting a learner (the latter is used to catch up future voters), demotions could see use in improved handling of temporary node unavailability, where it is desired to remove voting power from a down node, but to preserve its data should it return. An additional change made in this commit is to prevent the use of empty commit quorums, which was previously possible but served no good purpose; this closes #10884. The work left to do in a future PR is to actually expose joint configurations to the applications using Raft. This will entail mostly API design and the addition of suitable testing, which, to be carried out ergonomically, is likely to motivate a larger refactor. Touches #7625.
This commit is contained in:
@ -17,6 +17,7 @@ package tracker
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"go.etcd.io/etcd/raft/quorum"
|
||||
)
|
||||
@ -33,12 +34,11 @@ type Config struct {
|
||||
// simplifies the implementation since it allows peers to have clarity about
|
||||
// its current role without taking into account joint consensus.
|
||||
Learners map[uint64]struct{}
|
||||
// TODO(tbg): when we actually carry out joint consensus changes and turn a
|
||||
// voter into a learner, we cannot add the learner when entering the joint
|
||||
// state. This is because this would violate the invariant that the inter-
|
||||
// section of voters and learners is empty. For example, assume a Voter is
|
||||
// removed and immediately re-added as a learner (or in other words, it is
|
||||
// demoted).
|
||||
// When we turn a voter into a learner during a joint consensus transition,
|
||||
// we cannot add the learner directly when entering the joint state. This is
|
||||
// because this would violate the invariant that the intersection of
|
||||
// voters and learners is empty. For example, assume a Voter is removed and
|
||||
// immediately re-added as a learner (or in other words, it is demoted):
|
||||
//
|
||||
// Initially, the configuration will be
|
||||
//
|
||||
@ -51,7 +51,7 @@ type Config struct {
|
||||
// learners: {3}
|
||||
//
|
||||
// but this violates the invariant (3 is both voter and learner). Instead,
|
||||
// we have
|
||||
// we get
|
||||
//
|
||||
// voters: {1 2} & {1 2 3}
|
||||
// learners: {}
|
||||
@ -66,20 +66,40 @@ type Config struct {
|
||||
//
|
||||
// Note that next_learners is not used while adding a learner that is not
|
||||
// also a voter in the joint config. In this case, the learner is added
|
||||
// to Learners right away when entering the joint configuration, so that it
|
||||
// is caught up as soon as possible.
|
||||
//
|
||||
// NextLearners map[uint64]struct{}
|
||||
// right away when entering the joint configuration, so that it is caught up
|
||||
// as soon as possible.
|
||||
LearnersNext map[uint64]struct{}
|
||||
}
|
||||
|
||||
func (c *Config) String() string {
|
||||
if len(c.Learners) == 0 {
|
||||
return fmt.Sprintf("voters=%s", c.Voters)
|
||||
func (c Config) String() string {
|
||||
var buf strings.Builder
|
||||
fmt.Fprintf(&buf, "voters=%s", c.Voters)
|
||||
if c.Learners != nil {
|
||||
fmt.Fprintf(&buf, " learners=%s", quorum.MajorityConfig(c.Learners).String())
|
||||
}
|
||||
if c.LearnersNext != nil {
|
||||
fmt.Fprintf(&buf, " learners_next=%s", quorum.MajorityConfig(c.LearnersNext).String())
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// Clone returns a copy of the Config that shares no memory with the original.
|
||||
func (c *Config) Clone() Config {
|
||||
clone := func(m map[uint64]struct{}) map[uint64]struct{} {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
mm := make(map[uint64]struct{}, len(m))
|
||||
for k := range m {
|
||||
mm[k] = struct{}{}
|
||||
}
|
||||
return mm
|
||||
}
|
||||
return Config{
|
||||
Voters: quorum.JointConfig{clone(c.Voters[0]), clone(c.Voters[1])},
|
||||
Learners: clone(c.Learners),
|
||||
LearnersNext: clone(c.LearnersNext),
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
"voters=%s learners=%s",
|
||||
c.Voters, quorum.MajorityConfig(c.Learners).String(),
|
||||
)
|
||||
}
|
||||
|
||||
// ProgressTracker tracks the currently active configuration and the information
|
||||
@ -88,7 +108,7 @@ func (c *Config) String() string {
|
||||
type ProgressTracker struct {
|
||||
Config
|
||||
|
||||
Progress map[uint64]*Progress
|
||||
Progress ProgressMap
|
||||
|
||||
Votes map[uint64]bool
|
||||
|
||||
@ -102,11 +122,10 @@ func MakeProgressTracker(maxInflight int) ProgressTracker {
|
||||
Config: Config{
|
||||
Voters: quorum.JointConfig{
|
||||
quorum.MajorityConfig{},
|
||||
// TODO(tbg): this will be mostly empty, so make it a nil pointer
|
||||
// in the common case.
|
||||
quorum.MajorityConfig{},
|
||||
nil, // only populated when used
|
||||
},
|
||||
Learners: map[uint64]struct{}{},
|
||||
Learners: nil, // only populated when used
|
||||
LearnersNext: nil, // only populated when used
|
||||
},
|
||||
Votes: map[uint64]bool{},
|
||||
Progress: map[uint64]*Progress{},
|
||||
@ -139,44 +158,6 @@ func (p *ProgressTracker) Committed() uint64 {
|
||||
return uint64(p.Voters.CommittedIndex(matchAckIndexer(p.Progress)))
|
||||
}
|
||||
|
||||
// RemoveAny removes this peer, which *must* be tracked as a voter or learner,
|
||||
// from the tracker.
|
||||
func (p *ProgressTracker) RemoveAny(id uint64) {
|
||||
_, okPR := p.Progress[id]
|
||||
_, okV1 := p.Voters[0][id]
|
||||
_, okV2 := p.Voters[1][id]
|
||||
_, okL := p.Learners[id]
|
||||
|
||||
okV := okV1 || okV2
|
||||
|
||||
if !okPR {
|
||||
panic("attempting to remove unknown peer %x")
|
||||
} else if !okV && !okL {
|
||||
panic("attempting to remove unknown peer %x")
|
||||
} else if okV && okL {
|
||||
panic(fmt.Sprintf("peer %x is both voter and learner", id))
|
||||
}
|
||||
|
||||
delete(p.Voters[0], id)
|
||||
delete(p.Voters[1], id)
|
||||
delete(p.Learners, id)
|
||||
delete(p.Progress, id)
|
||||
}
|
||||
|
||||
// InitProgress initializes a new progress for the given node or learner. The
|
||||
// node may not exist yet in either form or a panic will ensue.
|
||||
func (p *ProgressTracker) InitProgress(id, match, next uint64, isLearner bool) {
|
||||
if pr := p.Progress[id]; pr != nil {
|
||||
panic(fmt.Sprintf("peer %x already tracked as node %v", id, pr))
|
||||
}
|
||||
if !isLearner {
|
||||
p.Voters[0][id] = struct{}{}
|
||||
} else {
|
||||
p.Learners[id] = struct{}{}
|
||||
}
|
||||
p.Progress[id] = &Progress{Next: next, Match: match, Inflights: NewInflights(p.MaxInflight), IsLearner: isLearner}
|
||||
}
|
||||
|
||||
// Visit invokes the supplied closure for all tracked progresses.
|
||||
func (p *ProgressTracker) Visit(f func(id uint64, pr *Progress)) {
|
||||
for id, pr := range p.Progress {
|
||||
|
Reference in New Issue
Block a user