clientv3: use grpc reconnection logic

This commit is contained in:
Anthony Romano
2016-06-07 17:03:48 -07:00
parent 1823702cc6
commit 62f8ec25c0
15 changed files with 171 additions and 594 deletions

View File

@ -23,13 +23,11 @@ import (
"net"
"net/url"
"strings"
"sync"
"time"
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
"golang.org/x/net/context"
"golang.org/x/time/rate"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/metadata"
@ -37,9 +35,6 @@ import (
var (
ErrNoAvailableEndpoints = errors.New("etcdclient: no available endpoints")
// minConnRetryWait is the minimum time between reconnects to avoid flooding
minConnRetryWait = time.Second
)
// Client provides and manages an etcd v3 client session.
@ -54,20 +49,10 @@ type Client struct {
conn *grpc.ClientConn
cfg Config
creds *credentials.TransportAuthenticator
mu sync.RWMutex // protects connection selection and error list
errors []error // errors passed to retryConnection
ctx context.Context
cancel context.CancelFunc
// fields below are managed by connMonitor
// reconnc accepts writes which signal the client should reconnect
reconnc chan error
// newconnc is closed on successful connect and set to a fresh channel
newconnc chan struct{}
lastConnErr error
// Username is a username for authentication
Username string
// Password is a password for authentication
@ -76,9 +61,6 @@ type Client struct {
// New creates a new etcdv3 client from a given configuration.
func New(cfg Config) (*Client, error) {
if cfg.retryDialer == nil {
cfg.retryDialer = dialEndpointList
}
if len(cfg.Endpoints) == 0 {
return nil, ErrNoAvailableEndpoints
}
@ -101,39 +83,9 @@ func NewFromConfigFile(path string) (*Client, error) {
}
// Close shuts down the client's etcd connections.
func (c *Client) Close() (err error) {
c.mu.Lock()
defer c.mu.Unlock()
// acquire the cancel
if c.cancel == nil {
// already canceled
if c.lastConnErr != c.ctx.Err() {
err = c.lastConnErr
}
return
}
cancel := c.cancel
c.cancel = nil
c.mu.Unlock()
// close watcher and lease before terminating connection
// so they don't retry on a closed client
c.Watcher.Close()
c.Lease.Close()
// cancel reconnection loop
cancel()
c.mu.Lock()
connc := c.newconnc
c.mu.Unlock()
// connc on cancel() is left closed
<-connc
c.mu.Lock()
if c.lastConnErr != c.ctx.Err() {
err = c.lastConnErr
}
return
// (NEW version from this commit.) Close shuts down the client's etcd
// connections by canceling the client context and closing the single
// underlying *grpc.ClientConn; reconnection bookkeeping is gone because
// gRPC now handles reconnects internally (per the commit title).
// NOTE(review): diff fragment — leading indentation was stripped by the renderer.
func (c *Client) Close() error {
c.cancel()
// toErr maps grpc/context errors to the client's error conventions (see toErr below).
return toErr(c.ctx, c.conn.Close())
}
// Ctx is a context for "out of band" messages (e.g., for sending
@ -144,15 +96,6 @@ func (c *Client) Ctx() context.Context { return c.ctx }
// Endpoints lists the registered endpoints for the client.
func (c *Client) Endpoints() []string { return c.cfg.Endpoints }
// (REMOVED by this commit.) Errors returns all errors that have been observed since called last.
func (c *Client) Errors() (errs []error) {
// c.mu guards the error list shared with retryConnection.
c.mu.Lock()
defer c.mu.Unlock()
// Hand the accumulated slice to the caller and reset it, so each
// observed error is reported exactly once across successive calls.
errs = c.errors
c.errors = nil
return errs
}
// authTokenCredential holds an authentication token string.
// NOTE(review): only the declaration is visible in this chunk; the methods
// that attach the token to RPCs are outside this view — confirm against the
// full file before relying on its semantics.
type authTokenCredential struct {
token string
}
@ -241,7 +184,7 @@ func WithRequireLeader(ctx context.Context) context.Context {
func newClient(cfg *Config) (*Client, error) {
if cfg == nil {
cfg = &Config{retryDialer: dialEndpointList}
cfg = &Config{}
}
var creds *credentials.TransportAuthenticator
if cfg.TLS != nil {
@ -251,26 +194,23 @@ func newClient(cfg *Config) (*Client, error) {
// use a temporary skeleton client to bootstrap first connection
ctx, cancel := context.WithCancel(context.TODO())
conn, err := cfg.retryDialer(&Client{cfg: *cfg, creds: creds, ctx: ctx, Username: cfg.Username, Password: cfg.Password})
if err != nil {
return nil, err
}
client := &Client{
conn: conn,
cfg: *cfg,
creds: creds,
ctx: ctx,
cancel: cancel,
reconnc: make(chan error, 1),
newconnc: make(chan struct{}),
conn: nil,
cfg: *cfg,
creds: creds,
ctx: ctx,
cancel: cancel,
}
if cfg.Username != "" && cfg.Password != "" {
client.Username = cfg.Username
client.Password = cfg.Password
}
go client.connMonitor()
// TODO: use grpc balancer
conn, err := client.Dial(cfg.Endpoints[0])
if err != nil {
return nil, err
}
client.conn = conn
client.Cluster = NewCluster(client)
client.KV = NewKV(client)
@ -289,126 +229,30 @@ func newClient(cfg *Config) (*Client, error) {
}
// (REMOVED by this commit.) ActiveConnection returns the current in-use connection
func (c *Client) ActiveConnection() *grpc.ClientConn {
// Connection selection is protected by c.mu; readers take the read lock.
c.mu.RLock()
defer c.mu.RUnlock()
if c.conn == nil {
// Invariant: a live client always holds a connection; nil here is a programmer bug.
panic("trying to return nil active connection")
}
return c.conn
}
// (REMOVED by this commit.) retryConnection establishes a new connection,
// recording err (if any) and the dial result into c.errors / c.lastConnErr.
// NOTE(review): as rendered, the defer appears before c.mu.Lock(); the diff
// renderer stripped markers and indentation, so verify statement order
// against the original source before reusing this text.
func (c *Client) retryConnection(err error) {
oldconn := c.conn
// return holding lock so old connection can be cleaned up in this defer
defer func() {
if oldconn != nil {
oldconn.Close()
if st, _ := oldconn.State(); st != grpc.Shutdown {
// wait so grpc doesn't leak sleeping goroutines
oldconn.WaitForStateChange(context.Background(), st)
}
}
c.mu.Unlock()
}()
c.mu.Lock()
if err != nil {
// Record the error that triggered this retry for Errors() to report.
c.errors = append(c.errors, err)
}
if c.cancel == nil {
// client has called Close() so don't try to dial out
return
}
c.mu.Unlock()
// Dial without holding the lock; retryDialer tries each endpoint in turn.
nc, dialErr := c.cfg.retryDialer(c)
c.mu.Lock()
if nc != nil {
c.conn = nc
}
if dialErr != nil {
c.errors = append(c.errors, dialErr)
}
// lastConnErr is nil on success; connWait returns it to blocked callers.
c.lastConnErr = dialErr
}
// (REMOVED by this commit.) connStartRetry schedules a reconnect if one is not already running
func (c *Client) connStartRetry(err error) {
c.mu.Lock()
// Snapshot reconnc under the lock; connMonitor replaces it after each cycle.
ch := c.reconnc
defer c.mu.Unlock()
// Non-blocking send: reconnc has capacity 1 (see connMonitor), so if a
// retry is already pending this request is coalesced with it.
select {
case ch <- err:
default:
}
}
// (REMOVED by this commit.) connWait waits for a reconnect to be processed,
// returning the (possibly new) connection and the last connection error.
func (c *Client) connWait(ctx context.Context, err error) (*grpc.ClientConn, error) {
// Capture the current newconnc BEFORE signaling, so we observe the close
// that corresponds to this retry cycle and not a later channel.
c.mu.RLock()
ch := c.newconnc
c.mu.RUnlock()
// Ask connMonitor to reconnect (coalesced if one is already pending).
c.connStartRetry(err)
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-ch:
// newconnc was closed: the retry cycle completed (or the client shut down).
}
c.mu.RLock()
defer c.mu.RUnlock()
if c.cancel == nil {
// Client was closed while we waited.
return c.conn, rpctypes.ErrConnClosed
}
return c.conn, c.lastConnErr
}
// (REMOVED by this commit.) connMonitor monitors the connection and handles
// retries: it serializes reconnect requests from reconnc, rate-limited to
// one per minConnRetryWait, until the client context is canceled.
func (c *Client) connMonitor() {
var err error
defer func() {
// On shutdown, run one final retryConnection with the context error so
// the old connection is cleaned up, then close newconnc to release any
// goroutines blocked in connWait. Close() relies on this final close.
c.retryConnection(c.ctx.Err())
close(c.newconnc)
}()
// At most one reconnect attempt per minConnRetryWait to avoid flooding.
limiter := rate.NewLimiter(rate.Every(minConnRetryWait), 1)
for limiter.Wait(c.ctx) == nil {
select {
case err = <-c.reconnc:
case <-c.ctx.Done():
return
}
c.retryConnection(err)
c.mu.Lock()
// Broadcast completion to all waiters, then arm fresh channels for the
// next cycle (reconnc capacity 1 enables the coalescing in connStartRetry).
close(c.newconnc)
c.newconnc = make(chan struct{})
c.reconnc = make(chan error, 1)
c.mu.Unlock()
}
}
// (REMOVED by this commit.) dialEndpointList attempts to connect to each endpoint in order until a
// connection is established. On total failure it returns the LAST dial error
// observed; earlier per-endpoint errors are discarded.
func dialEndpointList(c *Client) (*grpc.ClientConn, error) {
var err error
for _, ep := range c.Endpoints() {
conn, curErr := c.Dial(ep)
if curErr != nil {
err = curErr
} else {
// First successful dial wins; remaining endpoints are not tried.
return conn, nil
}
}
return nil, err
}
// (NEW version from this commit.) ActiveConnection returns the current
// in-use connection. No locking: the single conn is set once in newClient
// and gRPC manages reconnection internally, per the commit title.
func (c *Client) ActiveConnection() *grpc.ClientConn { return c.conn }
// isHaltErr returns true if the given error and context indicate no forward
// progress can be made, even after reconnecting.
func isHaltErr(ctx context.Context, err error) bool {
// NOTE(review): this span fuses the REMOVED and ADDED implementations from
// the diff (the +/- markers were stripped by the renderer). The next two
// lines are the OLD body; everything after them is the NEW body and is
// unreachable as rendered. Restore the diff markers before reusing.
isRPCError := strings.HasPrefix(grpc.ErrorDesc(err), "etcdserver: ")
return isRPCError || ctx.Err() != nil || err == rpctypes.ErrConnClosed
// NEW body: treat a done context, an etcdserver-originated RPC error, or a
// "client conn closing" error as halting; nil error is not halting.
if ctx != nil && ctx.Err() != nil {
return true
}
if err == nil {
return false
}
return strings.HasPrefix(grpc.ErrorDesc(err), "etcdserver: ") ||
strings.Contains(err.Error(), grpc.ErrClientConnClosing.Error())
}
// toErr converts a raw gRPC error into the client's error conventions:
// nil stays nil; context-caused failures surface as ctx.Err(); "client conn
// closing" failures normalize to the grpc.ErrClientConnClosing sentinel.
func toErr(ctx context.Context, err error) error {
if err == nil {
return nil
}
// Map the raw gRPC error to a typed rpctypes client error first.
err = rpctypes.Error(err)
if ctx.Err() != nil && strings.Contains(err.Error(), "context") {
// The context expired/canceled; prefer its canonical error value.
err = ctx.Err()
} else if strings.Contains(err.Error(), grpc.ErrClientConnClosing.Error()) {
// String match (gRPC wraps the sentinel); normalize to the sentinel itself.
err = grpc.ErrClientConnClosing
}
return err
}