From 04b0bcc06ccdc7ec4d9fe86fe159e0895aeb17ef Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Thu, 18 Apr 2019 17:57:30 -0400 Subject: [PATCH] storage: Don't swallow ErrEpochAlreadyIncremented IncrementEpoch was failing to distinguish between the request that caused the increment and another caller making the same increment. This is incorrect since a successful increment tells you *when* the node was confirmed dead, while relying on another node's increment leaves this uncertain. In rare cases involving a badly overloaded cluster, this could result in inconsistencies (non-serializable transactions) due to incorrect timestamp cache management. Fixes #35986 Release note (bug fix): Fix a rare inconsistency that could occur on badly overloaded clusters. --- pkg/storage/node_liveness.go | 12 +++++------ pkg/storage/node_liveness_test.go | 6 +++--- pkg/storage/replica_range_lease.go | 33 +++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/pkg/storage/node_liveness.go b/pkg/storage/node_liveness.go index 12076e6dcab5..e1c2f58aea26 100644 --- a/pkg/storage/node_liveness.go +++ b/pkg/storage/node_liveness.go @@ -53,6 +53,11 @@ var ( // the underlying liveness record has had its epoch incremented. ErrEpochIncremented = errors.New("heartbeat failed on epoch increment") + // ErrEpochAlreadyIncremented is returned by IncrementEpoch when + // someone else has already incremented the epoch to the desired + // value. + ErrEpochAlreadyIncremented = errors.New("epoch already incremented") + errLiveClockNotLive = errors.New("not live") ) @@ -685,8 +690,6 @@ func (nl *NodeLiveness) getLivenessLocked(nodeID roachpb.NodeID) (*storagepb.Liv return nil, ErrNoLivenessRecord } -var errEpochAlreadyIncremented = errors.New("epoch already incremented") - // IncrementEpoch is called to increment the current liveness epoch, // thereby invalidating anything relying on the liveness of the // previous epoch. This method does a conditional put on the node @@ -714,15 +717,12 @@ func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *storagepb. if err := nl.updateLiveness(ctx, update, liveness, func(actual storagepb.Liveness) error { defer nl.maybeUpdate(actual) if actual.Epoch > liveness.Epoch { - return errEpochAlreadyIncremented + return ErrEpochAlreadyIncremented } else if actual.Epoch < liveness.Epoch { return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch) } return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", *liveness, actual) }); err != nil { - if err == errEpochAlreadyIncremented { - return nil - } return err } diff --git a/pkg/storage/node_liveness_test.go b/pkg/storage/node_liveness_test.go index 9edc87a230ad..94f221e10093 100644 --- a/pkg/storage/node_liveness_test.go +++ b/pkg/storage/node_liveness_test.go @@ -316,8 +316,8 @@ func TestNodeLivenessEpochIncrement(t *testing.T) { t.Errorf("expected epoch increment == 1; got %d", c) } - // Verify noop on incrementing an already-incremented epoch. - if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), oldLiveness); err != nil { + // Verify error on incrementing an already-incremented epoch. + if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), oldLiveness); err != storage.ErrEpochAlreadyIncremented { t.Fatalf("unexpected error incrementing a non-live node: %s", err) } @@ -610,7 +610,7 @@ func TestNodeLivenessConcurrentIncrementEpochs(t *testing.T) { }() } for i := 0; i < concurrency; i++ { - if err := <-errCh; err != nil { + if err := <-errCh; err != nil && err != storage.ErrEpochAlreadyIncremented { t.Fatalf("concurrent increment epoch %d failed: %s", i, err) } } diff --git a/pkg/storage/replica_range_lease.go b/pkg/storage/replica_range_lease.go index 4f78b6d3df2b..35585fd4cc95 100644 --- a/pkg/storage/replica_range_lease.go +++ b/pkg/storage/replica_range_lease.go @@ -303,7 +303,38 @@ func (p *pendingLeaseRequest) requestLeaseAsync( log.Info(ctx, err) } } else if err = p.repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.Liveness); err != nil { - log.Error(ctx, err) + // If we get ErrEpochAlreadyIncremented, someone else beat + // us to it. This proves that the target node is truly + // dead *now*, but it doesn't prove that it was dead at + // status.Timestamp (which we've encoded into our lease + // request). It's possible that the node was temporarily + // considered dead but revived without having its epoch + // incremented, i.e. that it was in fact live at + // status.Timestamp. + // + // It would be incorrect to simply proceed to sending our + // lease request since our lease.Start may precede the + // effective end timestamp of the predecessor lease (the + // expiration of the last successful heartbeat before the + // epoch increment), and so under this lease this node's + // timestamp cache would not necessarily reflect all reads + // served by the prior leaseholder. + // + // It would be correct to bump the timestamp in the lease + // request and proceed, but that just sets up another race + // between this node and the one that already incremented + // the epoch. They're probably going to beat us this time + // too, so just return the NotLeaseHolderError here + // instead of trying to fix up the timestamps and submit + // the lease request. + // + // ErrEpochAlreadyIncremented is not an unusual situation, + // so we don't log it as an error. + // + // https://github.com/cockroachdb/cockroach/issues/35986 + if err != ErrEpochAlreadyIncremented { + log.Error(ctx, err) + } } } // Set error for propagation to all waiters below.