functional-tester/tester: refactor "Failure" to support liveness mode
Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
This commit is contained in:
@ -18,6 +18,8 @@ import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
)
|
||||
|
||||
// Failure defines failure injection interface.
|
||||
@ -33,28 +35,32 @@ type Failure interface {
|
||||
Recover(clus *Cluster) error
|
||||
// Desc returns a description of the failure
|
||||
Desc() string
|
||||
// FailureCase returns "rpcpb.FailureCase" enum type.
|
||||
FailureCase() rpcpb.FailureCase
|
||||
}
|
||||
|
||||
type description string
|
||||
|
||||
func (d description) Desc() string { return string(d) }
|
||||
|
||||
// injectMemberFunc is the per-member injection callback stored in
// failureByFunc.injectMember. The int argument is a member index: the Inject
// methods in this file call it with indexes obtained by ranging over
// clus.Members or over the result of killMap.
type injectMemberFunc func(*Cluster, int) error
|
||||
// recoverMemberFunc is the per-member recovery callback stored in
// failureByFunc.recoverMember. The Recover methods of failureFollower and
// failureLeader call it with the cluster and the index saved in f.last.
type recoverMemberFunc func(*Cluster, int) error
|
||||
|
||||
type failureByFunc struct {
|
||||
description
|
||||
desc
|
||||
failureCase rpcpb.FailureCase
|
||||
injectMember injectMemberFunc
|
||||
recoverMember recoverMemberFunc
|
||||
}
|
||||
|
||||
type failureFollower struct {
|
||||
failureByFunc
|
||||
last int
|
||||
lead int
|
||||
// Desc returns the explicitly configured description when it is non-empty;
// otherwise it falls back to the String() form of the failure case enum.
func (f *failureByFunc) Desc() string {
|
||||
if string(f.desc) != "" {
|
||||
return string(f.desc)
|
||||
}
|
||||
return f.failureCase.String()
|
||||
}
|
||||
|
||||
type failureLeader struct {
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureByFunc) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
type failureFollower struct {
|
||||
failureByFunc
|
||||
last int
|
||||
lead int
|
||||
@ -82,22 +88,6 @@ func (f *failureFollower) updateIndex(clus *Cluster) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *failureLeader) updateIndex(clus *Cluster) error {
|
||||
idx, err := clus.GetLeader()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f.lead = idx
|
||||
f.last = idx
|
||||
return nil
|
||||
}
|
||||
|
||||
type failureQuorum failureByFunc
|
||||
type failureAll failureByFunc
|
||||
|
||||
// failureUntilSnapshot injects a failure and waits for a snapshot event
|
||||
type failureUntilSnapshot struct{ Failure }
|
||||
|
||||
func (f *failureFollower) Inject(clus *Cluster) error {
|
||||
if err := f.updateIndex(clus); err != nil {
|
||||
return err
|
||||
@ -109,6 +99,24 @@ func (f *failureFollower) Recover(clus *Cluster) error {
|
||||
return f.recoverMember(clus, f.last)
|
||||
}
|
||||
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureFollower) FailureCase() rpcpb.FailureCase { return f.failureCase }
|
||||
|
||||
// failureLeader embeds failureByFunc and adds two member indexes, last and
// lead. updateIndex sets both to the index returned by clus.GetLeader, and
// Recover passes last to recoverMember.
type failureLeader struct {
|
||||
failureByFunc
|
||||
last int
|
||||
lead int
|
||||
}
|
||||
|
||||
// updateIndex asks the cluster for the current leader index and stores it in
// both f.lead and f.last. If clus.GetLeader fails, its error is returned
// unchanged and neither field is modified.
func (f *failureLeader) updateIndex(clus *Cluster) error {
|
||||
idx, err := clus.GetLeader()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f.lead = idx
|
||||
f.last = idx
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *failureLeader) Inject(clus *Cluster) error {
|
||||
if err := f.updateIndex(clus); err != nil {
|
||||
return err
|
||||
@ -120,6 +128,12 @@ func (f *failureLeader) Recover(clus *Cluster) error {
|
||||
return f.recoverMember(clus, f.last)
|
||||
}
|
||||
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureLeader) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
// failureQuorum is a defined type with the same underlying struct as
// failureByFunc. As a defined (non-embedding) type it does not carry over the
// methods declared on failureByFunc, so it declares its own.
type failureQuorum failureByFunc
|
||||
|
||||
func (f *failureQuorum) Inject(clus *Cluster) error {
|
||||
for i := range killMap(len(clus.Members), clus.rd) {
|
||||
if err := f.injectMember(clus, i); err != nil {
|
||||
@ -138,6 +152,10 @@ func (f *failureQuorum) Recover(clus *Cluster) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureQuorum) FailureCase() rpcpb.FailureCase { return f.failureCase }
|
||||
|
||||
// failureAll is a defined type with the same underlying struct as
// failureByFunc. As a defined (non-embedding) type it does not carry over the
// methods declared on failureByFunc, so it declares its own.
type failureAll failureByFunc
|
||||
|
||||
func (f *failureAll) Inject(clus *Cluster) error {
|
||||
for i := range clus.Members {
|
||||
if err := f.injectMember(clus, i); err != nil {
|
||||
@ -156,6 +174,18 @@ func (f *failureAll) Recover(clus *Cluster) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureAll) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
// failureUntilSnapshot injects a failure and waits for a snapshot event
|
||||
type failureUntilSnapshot struct {
|
||||
desc desc
|
||||
failureCase rpcpb.FailureCase
|
||||
|
||||
Failure
|
||||
}
|
||||
|
||||
const snapshotCount = 10000
|
||||
|
||||
func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
|
||||
@ -190,7 +220,14 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
|
||||
}
|
||||
|
||||
func (f *failureUntilSnapshot) Desc() string {
|
||||
return f.Failure.Desc() + " for a long time and expect it to recover from an incoming snapshot"
|
||||
if f.desc.Desc() != "" {
|
||||
return f.desc.Desc()
|
||||
}
|
||||
return f.failureCase.String() + " (to trigger snapshot)"
|
||||
}
|
||||
|
||||
// FailureCase returns the wrapper's own failureCase field rather than
// delegating to the embedded Failure.
func (f *failureUntilSnapshot) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
func killMap(size int, seed int) map[int]bool {
|
||||
@ -204,3 +241,7 @@ func killMap(size int, seed int) map[int]bool {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// desc is a string-backed description type; its Desc method returns the
// underlying string.
type desc string
|
||||
|
||||
// Desc returns d converted to a plain string.
func (d desc) Desc() string { return string(d) }
|
||||
|
@ -17,12 +17,16 @@ package tester
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
)
|
||||
|
||||
type failureExternal struct {
|
||||
Failure
|
||||
|
||||
description string
|
||||
desc string
|
||||
failureCase rpcpb.FailureCase
|
||||
|
||||
scriptPath string
|
||||
}
|
||||
|
||||
@ -34,11 +38,18 @@ func (f *failureExternal) Recover(clus *Cluster) error {
|
||||
return exec.Command(f.scriptPath, "disable", fmt.Sprintf("%d", clus.rd)).Run()
|
||||
}
|
||||
|
||||
func (f *failureExternal) Desc() string { return f.description }
|
||||
// Desc returns the description string stored in f.desc.
func (f *failureExternal) Desc() string {
|
||||
return f.desc
|
||||
}
|
||||
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureExternal) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
func newFailureExternal(scriptPath string) Failure {
|
||||
return &failureExternal{
|
||||
description: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
|
||||
desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
|
||||
failureCase: rpcpb.FailureCase_EXTERNAL,
|
||||
scriptPath: scriptPath,
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,8 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
)
|
||||
|
||||
type failpointStats struct {
|
||||
@ -42,14 +44,23 @@ func failpointFailures(clus *Cluster) (ret []Failure, err error) {
|
||||
if len(fp) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
fpFails := failuresFromFailpoint(fp, clus.Tester.FailpointCommands)
|
||||
|
||||
// wrap in delays so failpoint has time to trigger
|
||||
for i, fpf := range fpFails {
|
||||
if strings.Contains(fp, "Snap") {
|
||||
// hack to trigger snapshot failpoints
|
||||
fpFails[i] = &failureUntilSnapshot{fpf}
|
||||
fpFails[i] = &failureUntilSnapshot{
|
||||
desc: desc(fpf.Desc()),
|
||||
failureCase: rpcpb.FailureCase_FAILPOINTS,
|
||||
Failure: fpf,
|
||||
}
|
||||
} else {
|
||||
fpFails[i] = &failureDelay{fpf, 3 * time.Second}
|
||||
fpFails[i] = &failureDelay{
|
||||
Failure: fpf,
|
||||
delayDuration: 3 * time.Second,
|
||||
}
|
||||
}
|
||||
}
|
||||
ret = append(ret, fpFails...)
|
||||
@ -85,7 +96,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
|
||||
fs = append(fs, []Failure{
|
||||
&failureFollower{
|
||||
failureByFunc: failureByFunc{
|
||||
description: description(fmt.Sprintf("failpoint %s (one: %s)", fp, fcmd)),
|
||||
desc: desc(fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd)),
|
||||
failureCase: rpcpb.FailureCase_FAILPOINTS,
|
||||
injectMember: inject,
|
||||
recoverMember: recov,
|
||||
},
|
||||
@ -94,7 +106,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
|
||||
},
|
||||
&failureLeader{
|
||||
failureByFunc: failureByFunc{
|
||||
description: description(fmt.Sprintf("failpoint %s (leader: %s)", fp, fcmd)),
|
||||
desc: desc(fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd)),
|
||||
failureCase: rpcpb.FailureCase_FAILPOINTS,
|
||||
injectMember: inject,
|
||||
recoverMember: recov,
|
||||
},
|
||||
@ -102,12 +115,14 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
|
||||
lead: -1,
|
||||
},
|
||||
&failureQuorum{
|
||||
description: description(fmt.Sprintf("failpoint %s (quorum: %s)", fp, fcmd)),
|
||||
desc: desc(fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd)),
|
||||
failureCase: rpcpb.FailureCase_FAILPOINTS,
|
||||
injectMember: inject,
|
||||
recoverMember: recov,
|
||||
},
|
||||
&failureAll{
|
||||
description: description(fmt.Sprintf("failpoint %s (all: %s)", fp, fcmd)),
|
||||
desc: desc(fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd)),
|
||||
failureCase: rpcpb.FailureCase_FAILPOINTS,
|
||||
injectMember: inject,
|
||||
recoverMember: recov,
|
||||
},
|
||||
|
@ -26,7 +26,7 @@ func recoverKill(clus *Cluster, idx int) error {
|
||||
|
||||
func newFailureKillOneFollower() Failure {
|
||||
ff := failureByFunc{
|
||||
description: "kill one follower",
|
||||
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
@ -35,7 +35,7 @@ func newFailureKillOneFollower() Failure {
|
||||
|
||||
func newFailureKillLeader() Failure {
|
||||
ff := failureByFunc{
|
||||
description: "kill leader",
|
||||
failureCase: rpcpb.FailureCase_KILL_LEADER,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
@ -44,7 +44,7 @@ func newFailureKillLeader() Failure {
|
||||
|
||||
func newFailureKillQuorum() Failure {
|
||||
return &failureQuorum{
|
||||
description: "kill quorum",
|
||||
failureCase: rpcpb.FailureCase_KILL_QUORUM,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
@ -52,16 +52,22 @@ func newFailureKillQuorum() Failure {
|
||||
|
||||
func newFailureKillAll() Failure {
|
||||
return &failureAll{
|
||||
description: "kill all",
|
||||
failureCase: rpcpb.FailureCase_KILL_ALL,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureKillOneFollowerForLongTime() Failure {
|
||||
return &failureUntilSnapshot{newFailureKillOneFollower()}
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_FOR_LONG,
|
||||
Failure: newFailureKillOneFollower(),
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureKillLeaderForLongTime() Failure {
|
||||
return &failureUntilSnapshot{newFailureKillLeader()}
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: rpcpb.FailureCase_KILL_LEADER_FOR_LONG,
|
||||
Failure: newFailureKillLeader(),
|
||||
}
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error {
|
||||
|
||||
func newFailureBlackholePeerPortTxRxOneFollower() Failure {
|
||||
ff := failureByFunc{
|
||||
description: "blackhole peer port on one follower",
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
}
|
||||
@ -39,7 +39,7 @@ func newFailureBlackholePeerPortTxRxOneFollower() Failure {
|
||||
|
||||
func newFailureBlackholePeerPortTxRxLeader() Failure {
|
||||
ff := failureByFunc{
|
||||
description: "blackhole peer port on leader",
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
}
|
||||
@ -52,7 +52,7 @@ func newFailureBlackholePeerPortTxRxLeader() Failure {
|
||||
|
||||
func newFailureBlackholePeerPortTxRxAll() Failure {
|
||||
f := &failureAll{
|
||||
description: "blackhole peer port on all",
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
}
|
||||
|
@ -15,7 +15,6 @@
|
||||
package tester
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
@ -43,9 +42,8 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
|
||||
desc := fmt.Sprintf("delay follower peer port by %d ms", clus.Tester.DelayLatencyMs)
|
||||
ff := failureByFunc{
|
||||
description: description(desc),
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
}
|
||||
@ -57,9 +55,8 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
|
||||
desc := fmt.Sprintf("delay leader peer port by %d ms", clus.Tester.DelayLatencyMs)
|
||||
ff := failureByFunc{
|
||||
description: description(desc),
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
}
|
||||
@ -71,9 +68,8 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure {
|
||||
desc := fmt.Sprintf("delay all peer port by %d ms", clus.Tester.DelayLatencyMs)
|
||||
f := &failureAll{
|
||||
description: description(desc),
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
}
|
||||
|
@ -14,13 +14,24 @@
|
||||
|
||||
package tester
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
)
|
||||
|
||||
// failureNoOp is a defined type with the same underlying struct as
// failureByFunc. Its Inject and Recover methods do nothing and return nil.
type failureNoOp failureByFunc
|
||||
|
||||
// Inject does nothing and always returns nil; clus is unused.
func (f *failureNoOp) Inject(clus *Cluster) error { return nil }
|
||||
// Recover does nothing and always returns nil; clus is unused.
func (f *failureNoOp) Recover(clus *Cluster) error { return nil }
|
||||
// FailureCase returns the rpcpb.FailureCase value stored in f.failureCase.
func (f *failureNoOp) FailureCase() rpcpb.FailureCase { return f.failureCase }
|
||||
|
||||
func newFailureNoOp() Failure {
|
||||
return &failureNoOp{
|
||||
description: "no failure",
|
||||
f := &failureNoOp{
|
||||
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS,
|
||||
}
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
delayDuration: 5 * time.Second,
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user