etcd/tests/e2e/failover_test.go
Chao Chen ea035471ce online defrag notifies gRPC health server to expose NOT_SERVING status
Signed-off-by: Chao Chen <chaochn@amazon.com>
2023-10-25 08:58:33 -07:00

164 lines
5.2 KiB
Go

// Copyright 2023 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !cluster_proxy
package e2e
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
_ "google.golang.org/grpc/health"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/framework/config"
"go.etcd.io/etcd/tests/v3/framework/e2e"
)
const (
// in sync with how kubernetes uses etcd
// https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71
keepaliveTime = 30 * time.Second
keepaliveTimeout = 10 * time.Second
dialTimeout = 20 * time.Second
clientRuntime = 10 * time.Second
requestTimeout = 100 * time.Millisecond
)
func TestFailoverOnDefrag(t *testing.T) {
tcs := []struct {
name string
clusterOptions []e2e.EPClusterOption
gRPCDialOptions []grpc.DialOption
// common assertion
expectedMinTotalRequestsCount int
// happy case assertion
expectedMaxFailedRequestsCount int
// negative case assertion
expectedMinFailedRequestsCount int
}{
{
name: "defrag failover happy case",
clusterOptions: []e2e.EPClusterOption{
e2e.WithClusterSize(3),
e2e.WithExperimentalStopGRPCServiceOnDefrag(true),
e2e.WithGoFailEnabled(true),
},
gRPCDialOptions: []grpc.DialOption{
grpc.WithDisableServiceConfig(),
grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
},
expectedMinTotalRequestsCount: 300,
expectedMaxFailedRequestsCount: 5,
},
{
name: "defrag blocks one-third of requests with stopGRPCServiceOnDefrag set to false",
clusterOptions: []e2e.EPClusterOption{
e2e.WithClusterSize(3),
e2e.WithExperimentalStopGRPCServiceOnDefrag(false),
e2e.WithGoFailEnabled(true),
},
gRPCDialOptions: []grpc.DialOption{
grpc.WithDisableServiceConfig(),
grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
},
expectedMinTotalRequestsCount: 300,
expectedMinFailedRequestsCount: 90,
},
{
name: "defrag blocks one-third of requests with stopGRPCServiceOnDefrag set to true and client health check disabled",
clusterOptions: []e2e.EPClusterOption{
e2e.WithClusterSize(3),
e2e.WithExperimentalStopGRPCServiceOnDefrag(true),
e2e.WithGoFailEnabled(true),
},
expectedMinTotalRequestsCount: 300,
expectedMinFailedRequestsCount: 90,
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
e2e.BeforeTest(t)
clus, cerr := e2e.NewEtcdProcessCluster(context.TODO(), t, tc.clusterOptions...)
require.NoError(t, cerr)
t.Cleanup(func() { clus.Stop() })
endpoints := clus.EndpointsGRPC()
requestVolume, successfulRequestCount := 0, 0
g := new(errgroup.Group)
g.Go(func() (lastErr error) {
clusterClient, cerr := clientv3.New(clientv3.Config{
DialTimeout: dialTimeout,
DialKeepAliveTime: keepaliveTime,
DialKeepAliveTimeout: keepaliveTimeout,
Endpoints: endpoints,
DialOptions: tc.gRPCDialOptions,
})
if cerr != nil {
return cerr
}
defer clusterClient.Close()
timeout := time.After(clientRuntime)
for {
select {
case <-timeout:
return lastErr
default:
}
getContext, cancel := context.WithTimeout(context.Background(), requestTimeout)
_, err := clusterClient.Get(getContext, "health")
cancel()
requestVolume++
if err != nil {
lastErr = err
continue
}
successfulRequestCount++
}
})
triggerDefrag(t, clus.Procs[0])
err := g.Wait()
if err != nil {
t.Logf("etcd client failed to fail over, error (%v)", err)
}
t.Logf("request failure rate is %.2f%%, traffic volume successfulRequestCount %d requests, total %d requests", (1-float64(successfulRequestCount)/float64(requestVolume))*100, successfulRequestCount, requestVolume)
require.GreaterOrEqual(t, requestVolume, tc.expectedMinTotalRequestsCount)
failedRequestCount := requestVolume - successfulRequestCount
if tc.expectedMaxFailedRequestsCount != 0 {
require.LessOrEqual(t, failedRequestCount, tc.expectedMaxFailedRequestsCount)
}
if tc.expectedMinFailedRequestsCount != 0 {
require.GreaterOrEqual(t, failedRequestCount, tc.expectedMinFailedRequestsCount)
}
})
}
}
func triggerDefrag(t *testing.T, member e2e.EtcdProcess) {
require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), "defragBeforeCopy", `sleep("10s")`))
require.NoError(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}))
}