server: Make corruption check optional and period configurable

Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
This commit is contained in:
Marek Siarkowicz 2022-07-05 07:03:46 -07:00
parent 6697fca97d
commit d44bbff278
8 changed files with 49 additions and 19 deletions

View File

@ -149,6 +149,8 @@ type ServerConfig struct {
// before serving any peer/client traffic. // before serving any peer/client traffic.
InitialCorruptCheck bool InitialCorruptCheck bool
CorruptCheckTime time.Duration CorruptCheckTime time.Duration
CompactHashCheckEnabled bool
CompactHashCheckTime time.Duration
// PreVote is true to enable Raft Pre-Vote. // PreVote is true to enable Raft Pre-Vote.
PreVote bool PreVote bool

View File

@ -322,6 +322,9 @@ type Config struct {
ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"` ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"` ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalCompactHashCheckEnabled bool `json:"experimental-compact-hash-check-enabled"`
ExperimentalCompactHashCheckTime time.Duration `json:"experimental-compact-hash-check-time"`
// ExperimentalEnableLeaseCheckpoint enables leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change. // ExperimentalEnableLeaseCheckpoint enables leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change.
ExperimentalEnableLeaseCheckpoint bool `json:"experimental-enable-lease-checkpoint"` ExperimentalEnableLeaseCheckpoint bool `json:"experimental-enable-lease-checkpoint"`
// ExperimentalEnableLeaseCheckpointPersist enables persisting remainingTTL to prevent indefinite auto-renewal of long lived leases. Always enabled in v3.6. Should be used to ensure smooth upgrade from v3.5 clusters with this feature enabled. // ExperimentalEnableLeaseCheckpointPersist enables persisting remainingTTL to prevent indefinite auto-renewal of long lived leases. Always enabled in v3.6. Should be used to ensure smooth upgrade from v3.5 clusters with this feature enabled.
@ -521,6 +524,9 @@ func NewConfig() *Config {
ExperimentalTxnModeWriteWithSharedBuffer: true, ExperimentalTxnModeWriteWithSharedBuffer: true,
ExperimentalMaxLearners: membership.DefaultMaxLearners, ExperimentalMaxLearners: membership.DefaultMaxLearners,
ExperimentalCompactHashCheckEnabled: false,
ExperimentalCompactHashCheckTime: time.Minute,
V2Deprecation: config.V2_DEPR_DEFAULT, V2Deprecation: config.V2_DEPR_DEFAULT,
DiscoveryCfg: v3discovery.DiscoveryConfig{ DiscoveryCfg: v3discovery.DiscoveryConfig{
@ -759,6 +765,10 @@ func (cfg *Config) Validate() error {
return fmt.Errorf("setting experimental-enable-lease-checkpoint-persist requires experimental-enable-lease-checkpoint") return fmt.Errorf("setting experimental-enable-lease-checkpoint-persist requires experimental-enable-lease-checkpoint")
} }
if cfg.ExperimentalCompactHashCheckTime <= 0 {
return fmt.Errorf("--experimental-compact-hash-check-time must be >0 (set to %v)", cfg.ExperimentalCompactHashCheckTime)
}
return nil return nil
} }

View File

@ -202,6 +202,8 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
HostWhitelist: cfg.HostWhitelist, HostWhitelist: cfg.HostWhitelist,
InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck, InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck,
CorruptCheckTime: cfg.ExperimentalCorruptCheckTime, CorruptCheckTime: cfg.ExperimentalCorruptCheckTime,
CompactHashCheckEnabled: cfg.ExperimentalCompactHashCheckEnabled,
CompactHashCheckTime: cfg.ExperimentalCompactHashCheckTime,
PreVote: cfg.PreVote, PreVote: cfg.PreVote,
Logger: cfg.logger, Logger: cfg.logger,
ForceNewCluster: cfg.ForceNewCluster, ForceNewCluster: cfg.ForceNewCluster,
@ -344,6 +346,8 @@ func print(lg *zap.Logger, ec Config, sc config.ServerConfig, memberInitialized
zap.Bool("pre-vote", sc.PreVote), zap.Bool("pre-vote", sc.PreVote),
zap.Bool("initial-corrupt-check", sc.InitialCorruptCheck), zap.Bool("initial-corrupt-check", sc.InitialCorruptCheck),
zap.String("corrupt-check-time-interval", sc.CorruptCheckTime.String()), zap.String("corrupt-check-time-interval", sc.CorruptCheckTime.String()),
zap.Bool("compact-check-time-enabled", sc.CompactHashCheckEnabled),
zap.Duration("compact-check-time-interval", sc.CompactHashCheckTime),
zap.String("auto-compaction-mode", sc.AutoCompactionMode), zap.String("auto-compaction-mode", sc.AutoCompactionMode),
zap.Duration("auto-compaction-retention", sc.AutoCompactionRetention), zap.Duration("auto-compaction-retention", sc.AutoCompactionRetention),
zap.String("auto-compaction-interval", sc.AutoCompactionRetention.String()), zap.String("auto-compaction-interval", sc.AutoCompactionRetention.String()),

View File

@ -259,6 +259,8 @@ func newConfig() *config {
// experimental // experimental
fs.BoolVar(&cfg.ec.ExperimentalInitialCorruptCheck, "experimental-initial-corrupt-check", cfg.ec.ExperimentalInitialCorruptCheck, "Enable to check data corruption before serving any client/peer traffic.") fs.BoolVar(&cfg.ec.ExperimentalInitialCorruptCheck, "experimental-initial-corrupt-check", cfg.ec.ExperimentalInitialCorruptCheck, "Enable to check data corruption before serving any client/peer traffic.")
fs.DurationVar(&cfg.ec.ExperimentalCorruptCheckTime, "experimental-corrupt-check-time", cfg.ec.ExperimentalCorruptCheckTime, "Duration of time between cluster corruption check passes.") fs.DurationVar(&cfg.ec.ExperimentalCorruptCheckTime, "experimental-corrupt-check-time", cfg.ec.ExperimentalCorruptCheckTime, "Duration of time between cluster corruption check passes.")
fs.BoolVar(&cfg.ec.ExperimentalCompactHashCheckEnabled, "experimental-compact-hash-check-enabled", cfg.ec.ExperimentalCompactHashCheckEnabled, "Enable leader to periodically check followers compaction hashes.")
fs.DurationVar(&cfg.ec.ExperimentalCompactHashCheckTime, "experimental-compact-hash-check-time", cfg.ec.ExperimentalCompactHashCheckTime, "Duration of time between leader checks followers compaction hashes.")
fs.BoolVar(&cfg.ec.ExperimentalEnableLeaseCheckpoint, "experimental-enable-lease-checkpoint", false, "Enable leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change.") fs.BoolVar(&cfg.ec.ExperimentalEnableLeaseCheckpoint, "experimental-enable-lease-checkpoint", false, "Enable leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change.")
// TODO: delete in v3.7 // TODO: delete in v3.7

View File

@ -301,14 +301,13 @@ func (cm *corruptionChecker) CompactHashCheck() {
cm.mux.Unlock() cm.mux.Unlock()
cm.lg.Info("finished compaction hash check", zap.Int("number-of-hashes-checked", i+1)) cm.lg.Info("finished compaction hash check", zap.Int("number-of-hashes-checked", i+1))
return return
} else { }
cm.lg.Warn("skipped checking hash; was not able to check all peers", cm.lg.Warn("skipped revision in compaction hash check; was not able to check all peers",
zap.Int("number-of-peers-checked", peersChecked), zap.Int("number-of-peers-checked", peersChecked),
zap.Int("number-of-peers", len(peers)), zap.Int("number-of-peers", len(peers)),
zap.Int64("revision", hash.Revision), zap.Int64("revision", hash.Revision),
) )
} }
}
cm.lg.Info("finished compaction hash check", zap.Int("number-of-hashes-checked", len(hashes))) cm.lg.Info("finished compaction hash check", zap.Int("number-of-hashes-checked", len(hashes)))
return return
} }

View File

@ -112,7 +112,6 @@ var (
// on the connection. Or we will not be able to reuse the connection // on the connection. Or we will not be able to reuse the connection
// (since it will timeout). // (since it will timeout).
monitorVersionInterval = rafthttp.ConnWriteTimeout - time.Second monitorVersionInterval = rafthttp.ConnWriteTimeout - time.Second
CompactHashCheckInterval = 15 * time.Second
recommendedMaxRequestBytesString = humanize.Bytes(uint64(recommendedMaxRequestBytes)) recommendedMaxRequestBytesString = humanize.Bytes(uint64(recommendedMaxRequestBytes))
storeMemberAttributeRegexp = regexp.MustCompile(path.Join(membership.StoreMembersPrefix, "[[:xdigit:]]{1,16}", "attributes")) storeMemberAttributeRegexp = regexp.MustCompile(path.Join(membership.StoreMembersPrefix, "[[:xdigit:]]{1,16}", "attributes"))
@ -2219,9 +2218,13 @@ func (s *EtcdServer) monitorKVHash() {
} }
func (s *EtcdServer) monitorCompactHash() { func (s *EtcdServer) monitorCompactHash() {
if !s.Cfg.CompactHashCheckEnabled {
return
}
t := s.Cfg.CompactHashCheckTime
for { for {
select { select {
case <-time.After(CompactHashCheckInterval): case <-time.After(t):
case <-s.stopping: case <-s.stopping:
return return
} }

View File

@ -23,7 +23,6 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"go.etcd.io/etcd/api/v3/etcdserverpb" "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/client/v3" "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/server/v3/etcdserver"
"go.etcd.io/etcd/server/v3/storage/datadir" "go.etcd.io/etcd/server/v3/storage/datadir"
"go.etcd.io/etcd/server/v3/storage/mvcc/testutil" "go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
"go.etcd.io/etcd/tests/v3/framework/config" "go.etcd.io/etcd/tests/v3/framework/config"
@ -136,10 +135,13 @@ func TestPeriodicCheckDetectsCorruption(t *testing.T) {
} }
func TestCompactHashCheckDetectCorruption(t *testing.T) { func TestCompactHashCheckDetectCorruption(t *testing.T) {
checkTime := time.Second
e2e.BeforeTest(t) e2e.BeforeTest(t)
epc, err := e2e.NewEtcdProcessCluster(t, &e2e.EtcdProcessClusterConfig{ epc, err := e2e.NewEtcdProcessCluster(t, &e2e.EtcdProcessClusterConfig{
ClusterSize: 3, ClusterSize: 3,
KeepDataDir: true, KeepDataDir: true,
CompactHashCheckEnabled: true,
CompactHashCheckTime: checkTime,
}) })
if err != nil { if err != nil {
t.Fatalf("could not start etcd process cluster (%v)", err) t.Fatalf("could not start etcd process cluster (%v)", err)
@ -173,7 +175,7 @@ func TestCompactHashCheckDetectCorruption(t *testing.T) {
assert.NoError(t, err) assert.NoError(t, err)
_, err = cc.Compact(5, config.CompactOption{}) _, err = cc.Compact(5, config.CompactOption{})
assert.NoError(t, err) assert.NoError(t, err)
time.Sleep(etcdserver.CompactHashCheckInterval * 11 / 10) time.Sleep(checkTime * 11 / 10)
alarmResponse, err := cc.AlarmList() alarmResponse, err := cc.AlarmList()
assert.NoError(t, err, "error on alarm list") assert.NoError(t, err, "error on alarm list")
assert.Equal(t, []*etcdserverpb.AlarmMember{{Alarm: etcdserverpb.AlarmType_CORRUPT, MemberID: memberID}}, alarmResponse.Alarms) assert.Equal(t, []*etcdserverpb.AlarmMember{{Alarm: etcdserverpb.AlarmType_CORRUPT, MemberID: memberID}}, alarmResponse.Alarms)

View File

@ -179,6 +179,8 @@ type EtcdProcessClusterConfig struct {
MaxConcurrentStreams uint32 // default is math.MaxUint32 MaxConcurrentStreams uint32 // default is math.MaxUint32
CorruptCheckTime time.Duration CorruptCheckTime time.Duration
CompactHashCheckEnabled bool
CompactHashCheckTime time.Duration
} }
// NewEtcdProcessCluster launches a new cluster from etcd processes, returning // NewEtcdProcessCluster launches a new cluster from etcd processes, returning
@ -351,6 +353,12 @@ func (cfg *EtcdProcessClusterConfig) EtcdServerProcessConfigs(tb testing.TB) []*
if cfg.CorruptCheckTime != 0 { if cfg.CorruptCheckTime != 0 {
args = append(args, "--experimental-corrupt-check-time", fmt.Sprintf("%s", cfg.CorruptCheckTime)) args = append(args, "--experimental-corrupt-check-time", fmt.Sprintf("%s", cfg.CorruptCheckTime))
} }
if cfg.CompactHashCheckEnabled {
args = append(args, "--experimental-compact-hash-check-enabled")
}
if cfg.CompactHashCheckTime != 0 {
args = append(args, "--experimental-compact-hash-check-time", cfg.CompactHashCheckTime.String())
}
etcdCfgs[i] = &EtcdServerProcessConfig{ etcdCfgs[i] = &EtcdServerProcessConfig{
lg: lg, lg: lg,