From 8f2a91ea4c9ffda1237f5bf4e92f1aac0f15c376 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 8 Oct 2019 20:23:41 +0200
Subject: [PATCH] storage: retry splits more

In tpccbench/nodes=9/cpu=4/multi-region, I consistently see the RESTORE
fail a split because the retry loop in executeAdminCommandWithDescriptor
interleaves with replicate queue activity until the retries are
exhausted. This is likely because, due to the geo-replicated nature of
the cluster, the many steps involved in replicating the range take much
longer than we're used to; in the logs I can see the split attempt and
an interleaving replication change taking turns until the split gives
up.

The solution is to just retry the split forever as long as we're only
seeing errors we know should eventually resolve. The retry limit was
introduced to fix #23310, but there is a lot of FUD around it. It's
very likely that the problem encountered there was really fixed in
#23762, which added a lease check between the split attempts.

Touches #41028.

Release justification: fixes a roachtest failure, is low risk

Release note: None
---
 pkg/storage/client_split_test.go | 72 --------------------------------
 pkg/storage/replica_command.go   | 11 +++--
 2 files changed, 7 insertions(+), 76 deletions(-)

diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go
index f3cb67ba9fc8..eb90b3a6bf2d 100644
--- a/pkg/storage/client_split_test.go
+++ b/pkg/storage/client_split_test.go
@@ -1857,78 +1857,6 @@ func TestStoreSplitOnRemovedReplica(t *testing.T) {
 	}
 }
 
-// TestStoreSplitFailsAfterMaxRetries prevents regression of #23310. It
-// ensures that an AdminSplit attempt will retry a limited number of times
-// before returning unsuccessfully.
-func TestStoreSplitFailsAfterMaxRetries(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-	if testing.Short() {
-		// As of Nov 2018, this test takes 7s. This is because AdminSplit
-		// contains an exponential backoff when performing its retries.
-		// We could plumb a retry policy through the testing knobs, but
-		// that doesn't seem worth it for such a specific test. Instead,
-		// skip this test for the short test suite.
-		t.Skip("short")
-	}
-
-	leftKey := roachpb.Key("a")
-	splitKey := roachpb.Key("b")
-	rightKey := roachpb.Key("c")
-
-	var splitAttempts int64
-	filter := func(ba roachpb.BatchRequest) *roachpb.Error {
-		// Intercept and fail replica 1's attempt to perform the AdminSplit.
-		// We detect the split's range descriptor update and return an
-		// AmbiguousResultError, which is retried.
-		for _, req := range ba.Requests {
-			if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok {
-				leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey))
-				if cput.Key.Equal(leftDescKey) {
-					var desc roachpb.RangeDescriptor
-					if err := cput.Value.GetProto(&desc); err != nil {
-						panic(err)
-					}
-
-					if desc.EndKey.Equal(splitKey) {
-						atomic.AddInt64(&splitAttempts, 1)
-						return roachpb.NewError(&roachpb.AmbiguousResultError{})
-					}
-				}
-			}
-		}
-		return nil
-	}
-
-	var args base.TestClusterArgs
-	args.ReplicationMode = base.ReplicationManual
-	args.ServerArgs.Knobs.Store = &storage.StoreTestingKnobs{
-		TestingRequestFilter: filter,
-	}
-
-	tc := testcluster.StartTestCluster(t, 1, args)
-	defer tc.Stopper().Stop(context.TODO())
-
-	// Split the data range, mainly to avoid other splits getting in our way.
-	for _, k := range []roachpb.Key{leftKey, rightKey} {
-		if _, _, err := tc.SplitRange(k); err != nil {
-			t.Fatal(errors.Wrapf(err, "split at %s", k))
-		}
-	}
-
-	// Send an AdminSplit request to the replica. In the filter above we'll
-	// continue to return AmbiguousResultErrors. The split retry loop should
-	// retry a few times before exiting unsuccessfully.
-	_, _, err := tc.SplitRange(splitKey)
-	if !testutils.IsError(err, "split at key .* failed: result is ambiguous") {
-		t.Errorf("unexpected error from SplitRange: %+v", err)
-	}
-
-	const expAttempts = 11 // 10 retries
-	if splitAttempts != expAttempts {
-		t.Errorf("expected %d split attempts, found %d", expAttempts, splitAttempts)
-	}
-}
-
 func TestStoreSplitGCThreshold(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	storeCfg := storage.TestStoreConfig(nil)
diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go
index e3b57fee7b16..6dd03c6a88f6 100644
--- a/pkg/storage/replica_command.go
+++ b/pkg/storage/replica_command.go
@@ -516,9 +516,13 @@ func (r *Replica) adminUnsplitWithDescriptor(
 func (r *Replica) executeAdminCommandWithDescriptor(
 	ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
 ) *roachpb.Error {
+	// Retry forever as long as we see errors we know will resolve.
 	retryOpts := base.DefaultRetryOptions()
-	retryOpts.MaxRetries = 10
-	var lastErr error
+	// Randomize quite a lot just in case someone else also interferes with us
+	// in a retry loop. Note that this is speculative; there wasn't an incident
+	// that suggested this.
+	retryOpts.RandomizationFactor = 0.5
+	lastErr := ctx.Err()
 	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
 		// The replica may have been destroyed since the start of the retry loop.
 		// We need to explicitly check this condition. Having a valid lease, as we
@@ -552,10 +556,9 @@ func (r *Replica) executeAdminCommandWithDescriptor(
 				return false
 			}
 		}); !retry {
-			return roachpb.NewError(lastErr)
+			break
 		}
 	}
-	// If we broke out of the loop after MaxRetries, return the last error.
 	return roachpb.NewError(lastErr)
 }
 
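
Editor's note: below is a standalone sketch of the retry pattern the patch adopts,
namely retrying indefinitely with jittered exponential backoff and stopping only on
a non-retryable error or context cancellation. It does not use CockroachDB's internal
util/retry package; the names retryForever, isRetryable, attemptSplit, and errAmbiguous
are illustrative stand-ins, and the backoff constants are assumptions rather than the
values supplied by base.DefaultRetryOptions().

// Illustrative only; not the CockroachDB implementation.
package main

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// errAmbiguous stands in for the transient errors (e.g. the descriptor changing
// underneath us) that the patched code keeps retrying on.
var errAmbiguous = errors.New("result is ambiguous")

// isRetryable decides whether an error should be retried indefinitely.
func isRetryable(err error) bool {
	return errors.Is(err, errAmbiguous)
}

// retryForever runs op with exponential backoff and +/-50% jitter, mirroring the
// RandomizationFactor = 0.5 in the patch so that two actors retrying against each
// other are unlikely to stay in lockstep.
func retryForever(ctx context.Context, op func() error) error {
	const (
		initialBackoff = 50 * time.Millisecond // assumed value, for illustration
		maxBackoff     = 1 * time.Second       // assumed value, for illustration
		randomization  = 0.5
	)
	backoff := initialBackoff
	for {
		err := op()
		if err == nil || !isRetryable(err) {
			return err // success, or an error we don't know how to retry
		}
		// Jitter the wait: backoff * [1-randomization, 1+randomization).
		wait := time.Duration(float64(backoff) * (1 - randomization + 2*randomization*rand.Float64()))
		select {
		case <-time.After(wait):
		case <-ctx.Done():
			// The context bounds the loop; return the last error seen.
			return err
		}
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	// attemptSplit is a stand-in for the admin command; it fails a few times with
	// a retryable error before succeeding.
	attempts := 0
	attemptSplit := func() error {
		attempts++
		if attempts < 5 {
			return errAmbiguous
		}
		return nil
	}
	fmt.Println("result:", retryForever(ctx, attemptSplit), "after", attempts, "attempts")
}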