From dbe47e365d48bdb410bf2a24552a8b7431e2452b Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 12 Mar 2018 17:32:22 -0400 Subject: [PATCH 1/3] storage: add TestingRequestFilter testing knob This change adds another testing filter that can be used to influence the error returned from a request before it is evaluated. Notably, the filter is run before the request is added to the CommandQueue, so blocking in the filter will not block interfering requests. Release note: None --- pkg/storage/replica.go | 6 ++++++ pkg/storage/storagebase/base.go | 6 ++++++ pkg/storage/store.go | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index 143c7cb56665..e102658e5a1b 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -1878,6 +1878,12 @@ func (r *Replica) Send( return nil, roachpb.NewError(err) } + if filter := r.store.cfg.TestingKnobs.TestingRequestFilter; filter != nil { + if pErr := filter(ba); pErr != nil { + return nil, pErr + } + } + // Differentiate between admin, read-only and write. var pErr *roachpb.Error if useRaft { diff --git a/pkg/storage/storagebase/base.go b/pkg/storage/storagebase/base.go index 2752bdbef670..7276478a74a4 100644 --- a/pkg/storage/storagebase/base.go +++ b/pkg/storage/storagebase/base.go @@ -55,6 +55,12 @@ func (f *FilterArgs) InRaftCmd() bool { return f.CmdID != "" } +// ReplicaRequestFilter can be used in testing to influence the error returned +// from a request before it is evaluated. Notably, the filter is run before the +// request is added to the CommandQueue, so blocking in the filter will not +// block interfering requests. +type ReplicaRequestFilter func(roachpb.BatchRequest) *roachpb.Error + // ReplicaCommandFilter may be used in tests through the StoreTestingKnobs to // intercept the handling of commands and artificially generate errors. Return // nil to continue with regular processing or non-nil to terminate processing diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 776ba581c974..3b2ad070a658 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -644,6 +644,12 @@ type StoreConfig struct { type StoreTestingKnobs struct { EvalKnobs batcheval.TestingKnobs + // TestingRequestFilter is called before evaluating each command on a + // replica. The filter is run before the request is added to the + // CommandQueue, so blocking in the filter will not block interfering + // requests. If it returns an error, the command will not be evaluated. + TestingRequestFilter storagebase.ReplicaRequestFilter + // TestingProposalFilter is called before proposing each command. TestingProposalFilter storagebase.ReplicaProposalFilter From 44da6d449419a2f76558cb73aa1fb2c2b27b482e Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 13 Mar 2018 10:07:28 -0400 Subject: [PATCH 2/3] storage: return an error from split retry loop after MaxRetries This loop used to return a `nil` error once the retry loop was exhausted, which made it look like the split had succeeded. It now returns an error. 
Release note: None --- pkg/storage/client_split_test.go | 64 ++++++++++++++++++++++++++++++++ pkg/storage/replica_command.go | 12 ++++-- 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go index 27717152ea6c..d3055a0c8cf2 100644 --- a/pkg/storage/client_split_test.go +++ b/pkg/storage/client_split_test.go @@ -1490,6 +1490,70 @@ func TestStoreSplitTimestampCacheDifferentLeaseHolder(t *testing.T) { } } +// TestStoreSplitFailsAfterMaxRetries prevents regression of #23310. It +// ensures that an AdminSplit attempt will retry a limited number of times +// before returning unsuccessfully. +func TestStoreSplitFailsAfterMaxRetries(t *testing.T) { + defer leaktest.AfterTest(t)() + + leftKey := roachpb.Key("a") + splitKey := roachpb.Key("b") + rightKey := roachpb.Key("c") + + var splitAttempts int64 + filter := func(ba roachpb.BatchRequest) *roachpb.Error { + // Intercept and fail replica 1's attempt to perform the AdminSplit. + // We detect the split's range descriptor update and return an + // AmbiguousResultError, which is retried. + for _, req := range ba.Requests { + if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok { + leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey)) + if cput.Key.Equal(leftDescKey) { + var desc roachpb.RangeDescriptor + if err := cput.Value.GetProto(&desc); err != nil { + panic(err) + } + + if desc.EndKey.Equal(splitKey) { + atomic.AddInt64(&splitAttempts, 1) + return roachpb.NewError(&roachpb.AmbiguousResultError{}) + } + } + } + } + return nil + } + + var args base.TestClusterArgs + args.ReplicationMode = base.ReplicationManual + args.ServerArgs.Knobs.Store = &storage.StoreTestingKnobs{ + TestingRequestFilter: filter, + } + + tc := testcluster.StartTestCluster(t, 1, args) + defer tc.Stopper().Stop(context.TODO()) + + // Split the data range, mainly to avoid other splits getting in our way. + for _, k := range []roachpb.Key{leftKey, rightKey} { + if _, _, err := tc.SplitRange(k); err != nil { + t.Fatal(errors.Wrapf(err, "split at %s", k)) + } + } + + // Send an AdminSplit request to the replica. In the filter above we'll + // continue to return AmbiguousResultErrors. The split retry loop should + // retry a few times before exiting unsuccessfully. + _, _, err := tc.SplitRange(splitKey) + if !testutils.IsError(err, "split at key .* failed: result is ambiguous") { + t.Errorf("unexpected error from SplitRange: %v", err) + } + + const expAttempts = 11 // 10 retries + if splitAttempts != expAttempts { + t.Errorf("expected %d split attempts, found %d", expAttempts, splitAttempts) + } +} + func TestStoreSplitGCThreshold(t *testing.T) { defer leaktest.AfterTest(t)() storeCfg := storage.TestStoreConfig(nil) diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index bea905498448..028e0e4ba49d 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -705,12 +705,15 @@ func runCommitTrigger( // AdminSplit divides the range into into two ranges using args.SplitKey. 
func (r *Replica) AdminSplit( ctx context.Context, args roachpb.AdminSplitRequest, -) (roachpb.AdminSplitResponse, *roachpb.Error) { +) (reply roachpb.AdminSplitResponse, pErr *roachpb.Error) { if len(args.SplitKey) == 0 { return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided") } - for retryable := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); retryable.Next(); { - reply, _, pErr := r.adminSplitWithDescriptor(ctx, args, r.Desc()) + + retryOpts := base.DefaultRetryOptions() + retryOpts.MaxRetries = 10 + for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); { + reply, _, pErr = r.adminSplitWithDescriptor(ctx, args, r.Desc()) // On seeing a ConditionFailedError or an AmbiguousResultError, retry the // command with the updated descriptor. switch pErr.GetDetail().(type) { @@ -720,7 +723,8 @@ func (r *Replica) AdminSplit( return reply, pErr } } - return roachpb.AdminSplitResponse{}, roachpb.NewError(ctx.Err()) + // If we broke out of the loop after MaxRetries, return the last error. + return roachpb.AdminSplitResponse{}, pErr } func maybeDescriptorChangedError(desc *roachpb.RangeDescriptor, err error) (string, bool) { From 49301bef119fdcbff27fb4399c00baeb7c021c90 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 12 Mar 2018 17:34:28 -0400 Subject: [PATCH 3/3] storage: check leaseholder status in AdminSplit retry loop Fixes #23673. This change adds a call to `redirectOnOrAcquireLease` to the AdminSplit retry loop. This ensures that each attempt to perform a split is done by a leaseholder. Without this check, it was possible that a stale follower or a replica that had already been removed from the range could end up in an infinite retry loop. This was possible because these replicas could have stale knowledge about the state of the range descriptor. The effect of this was that the replicas' updates to the local range descriptor key could indefinitely hit `ConditionFailedErrors`, which are retried. The change also adds a regression test that creates this "replica removed while splitting" scenario. Without the leaseholder check introduced here, the test would fail. Now that the root cause of this infinite retry loop is fixed, we should consider reverting #23310. This loop should not retry indefinitely. If it does, exiting will just mask the issue. Release note: None --- pkg/storage/client_split_test.go | 113 +++++++++++++++++++++++++++++++ pkg/storage/replica_command.go | 9 +++ 2 files changed, 122 insertions(+) diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go index d3055a0c8cf2..1e2a1518bac1 100644 --- a/pkg/storage/client_split_test.go +++ b/pkg/storage/client_split_test.go @@ -1490,6 +1490,119 @@ func TestStoreSplitTimestampCacheDifferentLeaseHolder(t *testing.T) { } } +// TestStoreSplitOnRemovedReplica prevents regression of #23673. In that issue, +// it was observed that the retry loop in AdminSplit could go into an infinite +// loop if the replica it was being run on had been removed from the range. The +// loop now checks that the replica performing the split is the leaseholder +// before each iteration. 
+func TestStoreSplitOnRemovedReplica(t *testing.T) { + defer leaktest.AfterTest(t)() + + leftKey := roachpb.Key("a") + splitKey := roachpb.Key("b") + rightKey := roachpb.Key("c") + + var newDesc roachpb.RangeDescriptor + inFilter := make(chan struct{}, 1) + beginBlockingSplit := make(chan struct{}) + finishBlockingSplit := make(chan struct{}) + filter := func(ba roachpb.BatchRequest) *roachpb.Error { + // Block replica 1's attempt to perform the AdminSplit. We detect the + // split's range descriptor update and block until the rest of the test + // is ready. We then return a ConditionFailedError, simulating a + // descriptor update race. + if ba.Replica.NodeID == 1 { + for _, req := range ba.Requests { + if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok { + leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey)) + if cput.Key.Equal(leftDescKey) { + var desc roachpb.RangeDescriptor + if err := cput.Value.GetProto(&desc); err != nil { + panic(err) + } + + if desc.EndKey.Equal(splitKey) { + select { + case <-beginBlockingSplit: + select { + case inFilter <- struct{}{}: + // Let the test know we're in the filter. + default: + } + <-finishBlockingSplit + + var val roachpb.Value + if err := val.SetProto(&newDesc); err != nil { + panic(err) + } + return roachpb.NewError(&roachpb.ConditionFailedError{ + ActualValue: &val, + }) + default: + } + } + } + } + } + } + return nil + } + + var args base.TestClusterArgs + args.ReplicationMode = base.ReplicationManual + args.ServerArgs.Knobs.Store = &storage.StoreTestingKnobs{ + TestingRequestFilter: filter, + } + + tc := testcluster.StartTestCluster(t, 3, args) + defer tc.Stopper().Stop(context.TODO()) + + // Split the data range, mainly to avoid other splits getting in our way. + for _, k := range []roachpb.Key{leftKey, rightKey} { + if _, _, err := tc.SplitRange(k); err != nil { + t.Fatal(errors.Wrapf(err, "split at %s", k)) + } + } + + // Send an AdminSplit request to the replica. In the filter above we'll + // block the first cput in this split until we're ready to let it loose + // again, which will be after we remove the replica from the range. + splitRes := make(chan error) + close(beginBlockingSplit) + go func() { + _, _, err := tc.SplitRange(splitKey) + splitRes <- err + }() + <-inFilter + + // Move the range from node 0 to node 1. Then add node 2 to the range. + // node 0 will never hear about this range descriptor update. + var err error + if newDesc, err = tc.AddReplicas(leftKey, tc.Target(1)); err != nil { + t.Fatal(err) + } + if err := tc.TransferRangeLease(newDesc, tc.Target(1)); err != nil { + t.Fatal(err) + } + if _, err := tc.RemoveReplicas(leftKey, tc.Target(0)); err != nil { + t.Fatal(err) + } + if newDesc, err = tc.AddReplicas(leftKey, tc.Target(2)); err != nil { + t.Fatal(err) + } + + // Stop blocking the split request's cput. This will cause the cput to fail + // with a ConditionFailedError. The error will warrant a retry in + // AdminSplit's retry loop, but when the removed replica notices that it is + // no longer the leaseholder, it will return a NotLeaseholderError. This in + // turn will allow the AdminSplit to be re-routed to the new leaseholder, + // where it will succeed. + close(finishBlockingSplit) + if err = <-splitRes; err != nil { + t.Errorf("AdminSplit returned error: %v", err) + } +} + // TestStoreSplitFailsAfterMaxRetries prevents regression of #23310. It // ensures that an AdminSplit attempt will retry a limited number of times // before returning unsuccessfully. 
diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index 028e0e4ba49d..afd0b19b12a7 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -713,6 +713,15 @@ func (r *Replica) AdminSplit( retryOpts := base.DefaultRetryOptions() retryOpts.MaxRetries = 10 for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); { + // Admin commands always require the range lease to begin (see + // executeAdminBatch), but we may have lost it while in this retry loop. + // Without the lease, a replica's local descriptor can be arbitrarily + // stale, which will result in a ConditionFailedError. To avoid this, + // we make sure that we still have the lease before each attempt. + if _, pErr = r.redirectOnOrAcquireLease(ctx); pErr != nil { + return roachpb.AdminSplitResponse{}, pErr + } + reply, _, pErr = r.adminSplitWithDescriptor(ctx, args, r.Desc()) // On seeing a ConditionFailedError or an AmbiguousResultError, retry the // command with the updated descriptor.
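
Reviewer note: the net effect of patches 2 and 3 on the retry loop is easier to see in one place. Below is a condensed sketch of AdminSplit as it stands after both patches, assembled from the hunks above rather than copied verbatim from the final file; the bodies of the retried-error cases are paraphrased from the surrounding comment, since the hunks elide them.

// Condensed sketch (not verbatim) of AdminSplit after patches 2 and 3,
// assembled from the hunks above.
func (r *Replica) AdminSplit(
	ctx context.Context, args roachpb.AdminSplitRequest,
) (reply roachpb.AdminSplitResponse, pErr *roachpb.Error) {
	if len(args.SplitKey) == 0 {
		return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided")
	}

	// Patch 2: bound the number of attempts instead of retrying until the
	// context is canceled.
	retryOpts := base.DefaultRetryOptions()
	retryOpts.MaxRetries = 10
	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
		// Patch 3: re-check the lease before every attempt. A replica that has
		// lost the lease (or been removed from the range) may hold an
		// arbitrarily stale descriptor and would otherwise keep retrying on
		// ConditionFailedErrors.
		if _, pErr = r.redirectOnOrAcquireLease(ctx); pErr != nil {
			return roachpb.AdminSplitResponse{}, pErr
		}

		reply, _, pErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
		// On a ConditionFailedError or an AmbiguousResultError, retry the
		// command with the updated descriptor (case bodies paraphrased here).
		switch pErr.GetDetail().(type) {
		case *roachpb.ConditionFailedError, *roachpb.AmbiguousResultError:
			continue
		default:
			return reply, pErr
		}
	}
	// Patch 2: once MaxRetries is exhausted, surface the last error instead
	// of nil.
	return roachpb.AdminSplitResponse{}, pErr
}

The two new tests exercise the two exits of this loop: TestStoreSplitFailsAfterMaxRetries drives it to the MaxRetries bound via repeated AmbiguousResultErrors, and TestStoreSplitOnRemovedReplica forces the lease check to redirect the split to the new leaseholder after the original replica is removed.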