From 87b1d9571f31c39c428efa5dbb7a7fe56b421a52 Mon Sep 17 00:00:00 2001 From: Anthony Romano Date: Fri, 16 Dec 2016 00:21:07 -0800 Subject: [PATCH 1/3] v3api, rpctypes: add ErrTimeoutDueToConnectionLost Lack of GRPC code was causing this to look like a halting error to the client. --- etcdserver/api/v3rpc/rpctypes/error.go | 39 ++++++++++++++------------ etcdserver/api/v3rpc/util.go | 2 ++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/etcdserver/api/v3rpc/rpctypes/error.go b/etcdserver/api/v3rpc/rpctypes/error.go index 96905ff6561..1831516f444 100644 --- a/etcdserver/api/v3rpc/rpctypes/error.go +++ b/etcdserver/api/v3rpc/rpctypes/error.go @@ -52,12 +52,13 @@ var ( ErrGRPCPermissionNotGranted = grpc.Errorf(codes.FailedPrecondition, "etcdserver: permission is not granted to the role") ErrGRPCAuthNotEnabled = grpc.Errorf(codes.FailedPrecondition, "etcdserver: authentication is not enabled") - ErrGRPCNoLeader = grpc.Errorf(codes.Unavailable, "etcdserver: no leader") - ErrGRPCNotCapable = grpc.Errorf(codes.Unavailable, "etcdserver: not capable") - ErrGRPCStopped = grpc.Errorf(codes.Unavailable, "etcdserver: server stopped") - ErrGRPCTimeout = grpc.Errorf(codes.Unavailable, "etcdserver: request timed out") - ErrGRPCTimeoutDueToLeaderFail = grpc.Errorf(codes.Unavailable, "etcdserver: request timed out, possibly due to previous leader failure") - ErrGRPCUnhealthy = grpc.Errorf(codes.Unavailable, "etcdserver: unhealthy cluster") + ErrGRPCNoLeader = grpc.Errorf(codes.Unavailable, "etcdserver: no leader") + ErrGRPCNotCapable = grpc.Errorf(codes.Unavailable, "etcdserver: not capable") + ErrGRPCStopped = grpc.Errorf(codes.Unavailable, "etcdserver: server stopped") + ErrGRPCTimeout = grpc.Errorf(codes.Unavailable, "etcdserver: request timed out") + ErrGRPCTimeoutDueToLeaderFail = grpc.Errorf(codes.Unavailable, "etcdserver: request timed out, possibly due to previous leader failure") + ErrGRPCTimeoutDueToConnectionLost = grpc.Errorf(codes.Unavailable, "etcdserver: request timed out, possibly due to connection lost") + ErrGRPCUnhealthy = grpc.Errorf(codes.Unavailable, "etcdserver: unhealthy cluster") errStringToError = map[string]error{ grpc.ErrorDesc(ErrGRPCEmptyKey): ErrGRPCEmptyKey, @@ -91,12 +92,13 @@ var ( grpc.ErrorDesc(ErrGRPCPermissionNotGranted): ErrGRPCPermissionNotGranted, grpc.ErrorDesc(ErrGRPCAuthNotEnabled): ErrGRPCAuthNotEnabled, - grpc.ErrorDesc(ErrGRPCNoLeader): ErrGRPCNoLeader, - grpc.ErrorDesc(ErrGRPCNotCapable): ErrGRPCNotCapable, - grpc.ErrorDesc(ErrGRPCStopped): ErrGRPCStopped, - grpc.ErrorDesc(ErrGRPCTimeout): ErrGRPCTimeout, - grpc.ErrorDesc(ErrGRPCTimeoutDueToLeaderFail): ErrGRPCTimeoutDueToLeaderFail, - grpc.ErrorDesc(ErrGRPCUnhealthy): ErrGRPCUnhealthy, + grpc.ErrorDesc(ErrGRPCNoLeader): ErrGRPCNoLeader, + grpc.ErrorDesc(ErrGRPCNotCapable): ErrGRPCNotCapable, + grpc.ErrorDesc(ErrGRPCStopped): ErrGRPCStopped, + grpc.ErrorDesc(ErrGRPCTimeout): ErrGRPCTimeout, + grpc.ErrorDesc(ErrGRPCTimeoutDueToLeaderFail): ErrGRPCTimeoutDueToLeaderFail, + grpc.ErrorDesc(ErrGRPCTimeoutDueToConnectionLost): ErrGRPCTimeoutDueToConnectionLost, + grpc.ErrorDesc(ErrGRPCUnhealthy): ErrGRPCUnhealthy, } // client-side error @@ -131,12 +133,13 @@ var ( ErrPermissionNotGranted = Error(ErrGRPCPermissionNotGranted) ErrAuthNotEnabled = Error(ErrGRPCAuthNotEnabled) - ErrNoLeader = Error(ErrGRPCNoLeader) - ErrNotCapable = Error(ErrGRPCNotCapable) - ErrStopped = Error(ErrGRPCStopped) - ErrTimeout = Error(ErrGRPCTimeout) - ErrTimeoutDueToLeaderFail = Error(ErrGRPCTimeoutDueToLeaderFail) - ErrUnhealthy = Error(ErrGRPCUnhealthy) + ErrNoLeader = Error(ErrGRPCNoLeader) + ErrNotCapable = Error(ErrGRPCNotCapable) + ErrStopped = Error(ErrGRPCStopped) + ErrTimeout = Error(ErrGRPCTimeout) + ErrTimeoutDueToLeaderFail = Error(ErrGRPCTimeoutDueToLeaderFail) + ErrTimeoutDueToConnectionLost = Error(ErrGRPCTimeoutDueToConnectionLost) + ErrUnhealthy = Error(ErrGRPCUnhealthy) ) // EtcdError defines gRPC server errors. diff --git a/etcdserver/api/v3rpc/util.go b/etcdserver/api/v3rpc/util.go index 5c74e46dcb7..23586aa36a1 100644 --- a/etcdserver/api/v3rpc/util.go +++ b/etcdserver/api/v3rpc/util.go @@ -57,6 +57,8 @@ func togRPCError(err error) error { return rpctypes.ErrGRPCTimeout case etcdserver.ErrTimeoutDueToLeaderFail: return rpctypes.ErrGRPCTimeoutDueToLeaderFail + case etcdserver.ErrTimeoutDueToConnectionLost: + return rpctypes.ErrGRPCTimeoutDueToConnectionLost case etcdserver.ErrUnhealthy: return rpctypes.ErrGRPCUnhealthy From 46bd842db9fbc05ab1a1f80a5ffc922a3bbb5127 Mon Sep 17 00:00:00 2001 From: Anthony Romano Date: Fri, 16 Dec 2016 00:23:49 -0800 Subject: [PATCH 2/3] clientv3/integration: test lease grant/keepalive with/without failures --- clientv3/integration/lease_test.go | 56 ++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/clientv3/integration/lease_test.go b/clientv3/integration/lease_test.go index 82cd8a76726..8603077829a 100644 --- a/clientv3/integration/lease_test.go +++ b/clientv3/integration/lease_test.go @@ -17,10 +17,12 @@ package integration import ( "reflect" "sort" + "sync" "testing" "time" "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes" "github.com/coreos/etcd/integration" "github.com/coreos/etcd/pkg/testutil" @@ -574,3 +576,57 @@ func TestLeaseKeepAliveLoopExit(t *testing.T) { t.Fatalf("expected %T, got %v(%T)", clientv3.ErrKeepAliveHalted{}, err, err) } } + +// TestV3LeaseFailureOverlap issues Grant and Keepalive requests to a cluster +// before, during, and after quorum loss to confirm Grant/Keepalive tolerates +// transient cluster failure. +func TestV3LeaseFailureOverlap(t *testing.T) { + clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 2}) + defer clus.Terminate(t) + + numReqs := 5 + cli := clus.Client(0) + + // bring up a session, tear it down + updown := func(i int) error { + sess, err := concurrency.NewSession(cli) + if err != nil { + return err + } + ch := make(chan struct{}) + go func() { + defer close(ch) + sess.Close() + }() + select { + case <-ch: + case <-time.After(time.Minute / 4): + t.Fatalf("timeout %d", i) + } + return nil + } + + var wg sync.WaitGroup + mkReqs := func(n int) { + wg.Add(numReqs) + for i := 0; i < numReqs; i++ { + go func() { + defer wg.Done() + err := updown(n) + if err == nil || err == rpctypes.ErrTimeoutDueToConnectionLost { + return + } + t.Fatal(err) + }() + } + } + + mkReqs(1) + clus.Members[1].Stop(t) + mkReqs(2) + time.Sleep(time.Second) + mkReqs(3) + clus.Members[1].Restart(t) + mkReqs(4) + wg.Wait() +} From a375e91c66561b0a3123cffc77c91321c6d08dfd Mon Sep 17 00:00:00 2001 From: Anthony Romano Date: Fri, 16 Dec 2016 00:28:17 -0800 Subject: [PATCH 3/3] clientv3: don't reset keepalive stream on grant failure Was triggering cancelation errors on outstanding KeepAlives if Grant had to retry. --- clientv3/lease.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/clientv3/lease.go b/clientv3/lease.go index 81c9c607235..b1e155919ec 100644 --- a/clientv3/lease.go +++ b/clientv3/lease.go @@ -177,9 +177,6 @@ func (l *lessor) Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, err if isHaltErr(cctx, err) { return nil, toErr(cctx, err) } - if nerr := l.newStream(); nerr != nil { - return nil, nerr - } } }