functional-tester/tester: refactor "Failure" to support liveness mode

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
This commit is contained in:
Gyuho Lee
2018-04-04 13:03:57 -07:00
parent 3510e9b94c
commit b3fea7ed53
7 changed files with 137 additions and 57 deletions

View File

@ -18,6 +18,8 @@ import (
"fmt"
"math/rand"
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
)
// Failure defines failure injection interface.
@ -33,28 +35,32 @@ type Failure interface {
Recover(clus *Cluster) error
// Desc returns a description of the failure
Desc() string
// FailureCase returns "rpcpb.FailureCase" enum type.
FailureCase() rpcpb.FailureCase
}
// description is a plain-text label attached to a failure.
type description string

// Desc returns the label as an ordinary string.
func (d description) Desc() string {
	return string(d)
}
// Per-member callbacks used by the function-driven failures. Each one receives
// the cluster and the index of the member to act on.
type (
	// injectMemberFunc is invoked from Inject for a single member.
	injectMemberFunc func(*Cluster, int) error
	// recoverMemberFunc is invoked from Recover for a single member.
	recoverMemberFunc func(*Cluster, int) error
)
// failureByFunc is the shared base of the function-driven failures: it carries
// a description, the rpcpb failure case, and the per-member inject/recover
// callbacks.
// NOTE(review): this listing shows both a "description" and a "desc" embedded
// line; they look like the before/after of one rename — confirm which one the
// final struct keeps.
type failureByFunc struct {
description
desc
// failureCase is the value reported by FailureCase and the fallback text for Desc.
failureCase rpcpb.FailureCase
// injectMember and recoverMember are called with the cluster and a member index.
injectMember injectMemberFunc
recoverMember recoverMemberFunc
}
type failureFollower struct {
failureByFunc
last int
lead int
// Desc returns the explicit description when one was provided; otherwise it
// falls back to the string form of the failure case.
func (f *failureByFunc) Desc() string {
	if text := string(f.desc); text != "" {
		return text
	}
	return f.failureCase.String()
}
type failureLeader struct {
// FailureCase reports the rpcpb.FailureCase this failure was built with.
func (f *failureByFunc) FailureCase() rpcpb.FailureCase { return f.failureCase }
type failureFollower struct {
failureByFunc
last int
lead int
@ -82,22 +88,6 @@ func (f *failureFollower) updateIndex(clus *Cluster) error {
return nil
}
// updateIndex asks the cluster for its current leader and stores that index
// in both lead and last. On error the receiver is left untouched.
func (f *failureLeader) updateIndex(clus *Cluster) error {
	leaderIdx, err := clus.GetLeader()
	if err == nil {
		f.lead, f.last = leaderIdx, leaderIdx
		return nil
	}
	return err
}
// failureQuorum and failureAll share the field layout of failureByFunc.
type (
	failureQuorum failureByFunc
	failureAll    failureByFunc
)

// failureUntilSnapshot injects a failure and waits for a snapshot event
type failureUntilSnapshot struct {
	Failure
}
func (f *failureFollower) Inject(clus *Cluster) error {
if err := f.updateIndex(clus); err != nil {
return err
@ -109,6 +99,24 @@ func (f *failureFollower) Recover(clus *Cluster) error {
return f.recoverMember(clus, f.last)
}
// FailureCase reports the rpcpb.FailureCase this follower failure was built with.
func (f *failureFollower) FailureCase() rpcpb.FailureCase {
	return f.failureCase
}
// failureLeader applies a function-driven failure to the cluster leader.
type failureLeader struct {
	failureByFunc
	// last is the member index handed to recoverMember; lead is the leader
	// index obtained from the cluster. updateIndex sets both.
	last, lead int
}
// updateIndex asks the cluster for its current leader and stores that index
// in both lead and last. On error the receiver is left untouched.
func (f *failureLeader) updateIndex(clus *Cluster) error {
	leaderIdx, err := clus.GetLeader()
	if err == nil {
		f.lead, f.last = leaderIdx, leaderIdx
		return nil
	}
	return err
}
func (f *failureLeader) Inject(clus *Cluster) error {
if err := f.updateIndex(clus); err != nil {
return err
@ -120,6 +128,12 @@ func (f *failureLeader) Recover(clus *Cluster) error {
return f.recoverMember(clus, f.last)
}
// FailureCase reports the rpcpb.FailureCase this leader failure was built with.
func (f *failureLeader) FailureCase() rpcpb.FailureCase { return f.failureCase }
// failureQuorum applies a function-driven failure to the members selected by
// killMap. It shares the field layout of failureByFunc.
type failureQuorum failureByFunc

// Desc returns the explicit description when one is set and otherwise falls
// back to the failure case name.
//
// failureQuorum is a defined type, so it does not inherit failureByFunc.Desc;
// only the Desc of the embedded desc field is promoted. Without this method a
// value built with just a failureCase would describe itself as "".
func (f *failureQuorum) Desc() string {
	if text := string(f.desc); text != "" {
		return text
	}
	return f.failureCase.String()
}
func (f *failureQuorum) Inject(clus *Cluster) error {
for i := range killMap(len(clus.Members), clus.rd) {
if err := f.injectMember(clus, i); err != nil {
@ -138,6 +152,10 @@ func (f *failureQuorum) Recover(clus *Cluster) error {
return nil
}
// FailureCase reports the rpcpb.FailureCase this quorum failure was built with.
func (f *failureQuorum) FailureCase() rpcpb.FailureCase {
	return f.failureCase
}
// failureAll applies a function-driven failure to every cluster member. It
// shares the field layout of failureByFunc.
type failureAll failureByFunc

// Desc returns the explicit description when one is set and otherwise falls
// back to the failure case name.
//
// failureAll is a defined type, so it does not inherit failureByFunc.Desc;
// only the Desc of the embedded desc field is promoted. Without this method a
// value built with just a failureCase would describe itself as "".
func (f *failureAll) Desc() string {
	if text := string(f.desc); text != "" {
		return text
	}
	return f.failureCase.String()
}
func (f *failureAll) Inject(clus *Cluster) error {
for i := range clus.Members {
if err := f.injectMember(clus, i); err != nil {
@ -156,6 +174,18 @@ func (f *failureAll) Recover(clus *Cluster) error {
return nil
}
// FailureCase reports the rpcpb.FailureCase this all-members failure was built with.
func (f *failureAll) FailureCase() rpcpb.FailureCase { return f.failureCase }
// failureUntilSnapshot injects a failure and waits for a snapshot event
type failureUntilSnapshot struct {
// desc is an optional explicit description consulted by Desc.
desc desc
// failureCase is the value reported by FailureCase.
failureCase rpcpb.FailureCase
// Failure is the wrapped failure; its methods are promoted unless this type
// defines its own.
Failure
}
const snapshotCount = 10000
func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
@ -190,7 +220,14 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
}
// Desc describes the snapshot-triggering failure.
// NOTE(review): the first return below looks like the pre-change body left in
// this listing. As written it makes the statements after it unreachable; the
// remaining lines prefer the explicit desc and otherwise fall back to the
// failure case name plus " (to trigger snapshot)" — confirm which body is current.
func (f *failureUntilSnapshot) Desc() string {
return f.Failure.Desc() + " for a long time and expect it to recover from an incoming snapshot"
if f.desc.Desc() != "" {
return f.desc.Desc()
}
return f.failureCase.String() + " (to trigger snapshot)"
}
// FailureCase reports the wrapper's own failure case, not the wrapped failure's.
func (f *failureUntilSnapshot) FailureCase() rpcpb.FailureCase { return f.failureCase }
func killMap(size int, seed int) map[int]bool {
@ -204,3 +241,7 @@ func killMap(size int, seed int) map[int]bool {
}
}
}
// desc is a plain-text label attached to a failure.
type desc string

// Desc returns the label as an ordinary string.
func (d desc) Desc() string {
	return string(d)
}

View File

@ -17,12 +17,16 @@ package tester
import (
"fmt"
"os/exec"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
)
// failureExternal is a failure driven by an external script.
// NOTE(review): both "description string" and "desc string" appear in this
// listing; they look like the before/after of one rename — confirm which
// field the final struct keeps.
type failureExternal struct {
Failure
description string
desc string
// failureCase is the value reported by FailureCase.
failureCase rpcpb.FailureCase
// scriptPath is the executable that Recover runs with the "disable" argument.
scriptPath string
}
@ -34,11 +38,18 @@ func (f *failureExternal) Recover(clus *Cluster) error {
return exec.Command(f.scriptPath, "disable", fmt.Sprintf("%d", clus.rd)).Run()
}
// Desc reports the description stored on the external failure.
func (f *failureExternal) Desc() string {
	return f.description
}
// Desc reports the description stored on the external failure.
func (f *failureExternal) Desc() string { return f.desc }
// FailureCase reports the rpcpb.FailureCase this external failure was built with.
func (f *failureExternal) FailureCase() rpcpb.FailureCase { return f.failureCase }
// newFailureExternal returns a failureExternal for scriptPath, tagged with
// rpcpb.FailureCase_EXTERNAL and described by the quoted script path.
// NOTE(review): the "description:" and "desc:" entries carry the same text
// and look like the before/after of one rename — confirm which one is current.
func newFailureExternal(scriptPath string) Failure {
return &failureExternal{
description: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
failureCase: rpcpb.FailureCase_EXTERNAL,
scriptPath: scriptPath,
}
}

View File

@ -21,6 +21,8 @@ import (
"strings"
"sync"
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
)
type failpointStats struct {
@ -42,14 +44,23 @@ func failpointFailures(clus *Cluster) (ret []Failure, err error) {
if len(fp) == 0 {
continue
}
fpFails := failuresFromFailpoint(fp, clus.Tester.FailpointCommands)
// wrap in delays so failpoint has time to trigger
for i, fpf := range fpFails {
if strings.Contains(fp, "Snap") {
// hack to trigger snapshot failpoints
fpFails[i] = &failureUntilSnapshot{fpf}
fpFails[i] = &failureUntilSnapshot{
desc: desc(fpf.Desc()),
failureCase: rpcpb.FailureCase_FAILPOINTS,
Failure: fpf,
}
} else {
fpFails[i] = &failureDelay{fpf, 3 * time.Second}
fpFails[i] = &failureDelay{
Failure: fpf,
delayDuration: 3 * time.Second,
}
}
}
ret = append(ret, fpFails...)
@ -85,7 +96,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
fs = append(fs, []Failure{
&failureFollower{
failureByFunc: failureByFunc{
description: description(fmt.Sprintf("failpoint %s (one: %s)", fp, fcmd)),
desc: desc(fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd)),
failureCase: rpcpb.FailureCase_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
@ -94,7 +106,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
},
&failureLeader{
failureByFunc: failureByFunc{
description: description(fmt.Sprintf("failpoint %s (leader: %s)", fp, fcmd)),
desc: desc(fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd)),
failureCase: rpcpb.FailureCase_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
@ -102,12 +115,14 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
lead: -1,
},
&failureQuorum{
description: description(fmt.Sprintf("failpoint %s (quorum: %s)", fp, fcmd)),
desc: desc(fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd)),
failureCase: rpcpb.FailureCase_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
&failureAll{
description: description(fmt.Sprintf("failpoint %s (all: %s)", fp, fcmd)),
desc: desc(fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd)),
failureCase: rpcpb.FailureCase_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},

View File

@ -26,7 +26,7 @@ func recoverKill(clus *Cluster, idx int) error {
func newFailureKillOneFollower() Failure {
ff := failureByFunc{
description: "kill one follower",
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER,
injectMember: injectKill,
recoverMember: recoverKill,
}
@ -35,7 +35,7 @@ func newFailureKillOneFollower() Failure {
func newFailureKillLeader() Failure {
ff := failureByFunc{
description: "kill leader",
failureCase: rpcpb.FailureCase_KILL_LEADER,
injectMember: injectKill,
recoverMember: recoverKill,
}
@ -44,7 +44,7 @@ func newFailureKillLeader() Failure {
func newFailureKillQuorum() Failure {
return &failureQuorum{
description: "kill quorum",
failureCase: rpcpb.FailureCase_KILL_QUORUM,
injectMember: injectKill,
recoverMember: recoverKill,
}
@ -52,16 +52,22 @@ func newFailureKillQuorum() Failure {
// newFailureKillAll returns a failureAll that uses injectKill and recoverKill
// as its per-member callbacks, tagged rpcpb.FailureCase_KILL_ALL.
// NOTE(review): the `description: "kill all"` entry looks like the pre-change
// line left in this listing next to its failureCase replacement — confirm.
func newFailureKillAll() Failure {
return &failureAll{
description: "kill all",
failureCase: rpcpb.FailureCase_KILL_ALL,
injectMember: injectKill,
recoverMember: recoverKill,
}
}
// newFailureKillOneFollowerForLongTime wraps newFailureKillOneFollower in a
// failureUntilSnapshot tagged rpcpb.FailureCase_KILL_ONE_FOLLOWER_FOR_LONG.
// NOTE(review): two return statements appear here; the first (positional
// literal) looks like the pre-change form left in this listing — confirm.
func newFailureKillOneFollowerForLongTime() Failure {
return &failureUntilSnapshot{newFailureKillOneFollower()}
return &failureUntilSnapshot{
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_FOR_LONG,
Failure: newFailureKillOneFollower(),
}
}
// newFailureKillLeaderForLongTime wraps newFailureKillLeader in a
// failureUntilSnapshot tagged rpcpb.FailureCase_KILL_LEADER_FOR_LONG.
// NOTE(review): two return statements appear here; the first (positional
// literal) looks like the pre-change form left in this listing — confirm.
func newFailureKillLeaderForLongTime() Failure {
return &failureUntilSnapshot{newFailureKillLeader()}
return &failureUntilSnapshot{
failureCase: rpcpb.FailureCase_KILL_LEADER_FOR_LONG,
Failure: newFailureKillLeader(),
}
}

View File

@ -26,7 +26,7 @@ func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error {
func newFailureBlackholePeerPortTxRxOneFollower() Failure {
ff := failureByFunc{
description: "blackhole peer port on one follower",
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
}
@ -39,7 +39,7 @@ func newFailureBlackholePeerPortTxRxOneFollower() Failure {
func newFailureBlackholePeerPortTxRxLeader() Failure {
ff := failureByFunc{
description: "blackhole peer port on leader",
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
}
@ -52,7 +52,7 @@ func newFailureBlackholePeerPortTxRxLeader() Failure {
func newFailureBlackholePeerPortTxRxAll() Failure {
f := &failureAll{
description: "blackhole peer port on all",
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
}

View File

@ -15,7 +15,6 @@
package tester
import (
"fmt"
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
@ -43,9 +42,8 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
}
func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
desc := fmt.Sprintf("delay follower peer port by %d ms", clus.Tester.DelayLatencyMs)
ff := failureByFunc{
description: description(desc),
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
@ -57,9 +55,8 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
}
func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
desc := fmt.Sprintf("delay leader peer port by %d ms", clus.Tester.DelayLatencyMs)
ff := failureByFunc{
description: description(desc),
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
@ -71,9 +68,8 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
}
func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure {
desc := fmt.Sprintf("delay all peer port by %d ms", clus.Tester.DelayLatencyMs)
f := &failureAll{
description: description(desc),
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}

View File

@ -14,13 +14,24 @@
package tester
import (
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
)
// failureNoOp is a failure whose Inject and Recover do nothing. It shares the
// field layout of failureByFunc.
type failureNoOp failureByFunc

// Desc returns the explicit description when one is set and otherwise falls
// back to the failure case name.
//
// failureNoOp is a defined type, so it does not inherit failureByFunc.Desc;
// only the Desc of the embedded desc field is promoted. Without this method a
// value built with just a failureCase would describe itself as "".
func (f *failureNoOp) Desc() string {
	if text := string(f.desc); text != "" {
		return text
	}
	return f.failureCase.String()
}
// Inject does nothing and always succeeds.
func (f *failureNoOp) Inject(clus *Cluster) error {
	return nil
}
// Recover does nothing and always succeeds.
func (f *failureNoOp) Recover(clus *Cluster) error {
	return nil
}
// FailureCase reports the rpcpb.FailureCase this no-op failure was built with.
func (f *failureNoOp) FailureCase() rpcpb.FailureCase {
	return f.failureCase
}
// newFailureNoOp builds a failureNoOp tagged
// rpcpb.FailureCase_NO_FAIL_WITH_STRESS and returns it wrapped in a
// failureDelay whose delayDuration is 5 seconds.
// NOTE(review): the first two body lines (`return &failureNoOp{` and
// `description: "no failure",`) look like the pre-change body left in this
// listing; as shown the function is not valid Go — confirm against the real file.
func newFailureNoOp() Failure {
return &failureNoOp{
description: "no failure",
f := &failureNoOp{
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS,
}
return &failureDelay{
Failure: f,
delayDuration: 5 * time.Second,
}
}