storage: retry splits more
In tpccbench/nodes=9/cpu=4/multi-region, I consistently see the RESTORE
fail a split because the retry loop in executeAdminCommandWithDescriptor
interleaves with replicate queue activity until the retries are
exhausted. This is likely because, due to the geo-replicated nature of the
cluster, the many steps involved in replicating the range take much
longer than we're used to; in the logs I can see the split attempt
and an interleaving replication change taking turns until the split
gives up. The solution is to retry the split indefinitely, as long as
we only see errors that we know should eventually resolve.
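
To make the intended behavior concrete, here is a minimal standalone sketch of that retry policy (illustrative only, not the CockroachDB implementation; names such as retrySplitForever, runSplitAttempt, and errDescriptorChanged are invented for the example): keep retrying while the failure is of a kind that an interleaving replication change can cause, give up immediately on anything else, and let context cancellation end the loop.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errDescriptorChanged stands in for the kinds of errors (descriptor CPut
// failures, ambiguous results) that an interleaving replication change can
// cause and that are expected to resolve on their own.
var errDescriptorChanged = errors.New("range descriptor changed by concurrent replication")

func isExpectedToResolve(err error) bool {
	return errors.Is(err, errDescriptorChanged)
}

// retrySplitForever retries runSplitAttempt with capped exponential backoff
// until it succeeds, hits an error we don't know how to wait out, or the
// context is canceled. There is deliberately no cap on the attempt count.
func retrySplitForever(ctx context.Context, runSplitAttempt func() error) error {
	backoff := 50 * time.Millisecond
	const maxBackoff = 5 * time.Second
	for {
		err := runSplitAttempt()
		if err == nil || !isExpectedToResolve(err) {
			return err // success, or an error we don't know how to wait out
		}
		select {
		case <-time.After(backoff):
		case <-ctx.Done():
			return ctx.Err()
		}
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}

func main() {
	attempts := 0
	err := retrySplitForever(context.Background(), func() error {
		attempts++
		if attempts < 4 {
			return errDescriptorChanged // simulate interleaving replication changes
		}
		return nil
	})
	fmt.Println(attempts, err) // 4 <nil>
}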

The retry limit was introduced to fix cockroachdb#23310, but there is a lot
of FUD around it. It's very likely that the problem encountered there
was really fixed in cockroachdb#23762, which added a lease check between the
split attempts.

Touches cockroachdb#41028.

Release justification: fixes roachtest failure, is low risk

Release note: None
tbg committed Oct 8, 2019
1 parent 1c99165 commit 8f2a91e
Showing 2 changed files with 7 additions and 76 deletions.
72 changes: 0 additions & 72 deletions pkg/storage/client_split_test.go
@@ -1857,78 +1857,6 @@ func TestStoreSplitOnRemovedReplica(t *testing.T) {
}
}

// TestStoreSplitFailsAfterMaxRetries prevents regression of #23310. It
// ensures that an AdminSplit attempt will retry a limited number of times
// before returning unsuccessfully.
func TestStoreSplitFailsAfterMaxRetries(t *testing.T) {
	defer leaktest.AfterTest(t)()
	if testing.Short() {
		// As of Nov 2018, this test takes 7s. This is because AdminSplit
		// contains an exponential backoff when performing its retries.
		// We could plumb a retry policy through the testing knobs, but
		// that doesn't seem worth it for such a specific test. Instead,
		// skip this test for the short test suite.
		t.Skip("short")
	}

	leftKey := roachpb.Key("a")
	splitKey := roachpb.Key("b")
	rightKey := roachpb.Key("c")

	var splitAttempts int64
	filter := func(ba roachpb.BatchRequest) *roachpb.Error {
		// Intercept and fail replica 1's attempt to perform the AdminSplit.
		// We detect the split's range descriptor update and return an
		// AmbiguousResultError, which is retried.
		for _, req := range ba.Requests {
			if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok {
				leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey))
				if cput.Key.Equal(leftDescKey) {
					var desc roachpb.RangeDescriptor
					if err := cput.Value.GetProto(&desc); err != nil {
						panic(err)
					}

					if desc.EndKey.Equal(splitKey) {
						atomic.AddInt64(&splitAttempts, 1)
						return roachpb.NewError(&roachpb.AmbiguousResultError{})
					}
				}
			}
		}
		return nil
	}

	var args base.TestClusterArgs
	args.ReplicationMode = base.ReplicationManual
	args.ServerArgs.Knobs.Store = &storage.StoreTestingKnobs{
		TestingRequestFilter: filter,
	}

	tc := testcluster.StartTestCluster(t, 1, args)
	defer tc.Stopper().Stop(context.TODO())

	// Split the data range, mainly to avoid other splits getting in our way.
	for _, k := range []roachpb.Key{leftKey, rightKey} {
		if _, _, err := tc.SplitRange(k); err != nil {
			t.Fatal(errors.Wrapf(err, "split at %s", k))
		}
	}

	// Send an AdminSplit request to the replica. In the filter above we'll
	// continue to return AmbiguousResultErrors. The split retry loop should
	// retry a few times before exiting unsuccessfully.
	_, _, err := tc.SplitRange(splitKey)
	if !testutils.IsError(err, "split at key .* failed: result is ambiguous") {
		t.Errorf("unexpected error from SplitRange: %+v", err)
	}

	const expAttempts = 11 // 10 retries
	if splitAttempts != expAttempts {
		t.Errorf("expected %d split attempts, found %d", expAttempts, splitAttempts)
	}
}

func TestStoreSplitGCThreshold(t *testing.T) {
	defer leaktest.AfterTest(t)()
	storeCfg := storage.TestStoreConfig(nil)
11 changes: 7 additions & 4 deletions pkg/storage/replica_command.go
@@ -516,9 +516,13 @@ func (r *Replica) adminUnsplitWithDescriptor(
func (r *Replica) executeAdminCommandWithDescriptor(
	ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
) *roachpb.Error {
	// Retry forever as long as we see errors we know will resolve.
	retryOpts := base.DefaultRetryOptions()
	retryOpts.MaxRetries = 10
	var lastErr error
	// Randomize quite a lot just in case someone else also interferes with us
	// in a retry loop. Note that this is speculative; there wasn't an incident
	// that suggested this.
	retryOpts.RandomizationFactor = 0.5
	lastErr := ctx.Err()
	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
		// The replica may have been destroyed since the start of the retry loop.
		// We need to explicitly check this condition. Having a valid lease, as we
@@ -552,10 +556,9 @@ func (r *Replica) executeAdminCommandWithDescriptor(
				return false
			}
		}); !retry {
			return roachpb.NewError(lastErr)
			break
		}
	}
	// If we broke out of the loop after MaxRetries, return the last error.
	return roachpb.NewError(lastErr)
}
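
Beyond dropping the retry cap, the hunk above widens the backoff jitter via RandomizationFactor = 0.5. As a rough standalone illustration of what that means (a sketch of the usual jittered-backoff convention, not the util/retry implementation itself; jitteredBackoff is an invented helper): each sleep is drawn roughly uniformly from 0.5x to 1.5x of the nominal backoff, so two retry loops that collide once quickly drift apart instead of colliding on every subsequent attempt.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// jitteredBackoff perturbs the nominal backoff by +/- randomizationFactor,
// i.e. it returns a duration uniform in [(1-f)*base, (1+f)*base].
func jitteredBackoff(base time.Duration, randomizationFactor float64, rng *rand.Rand) time.Duration {
	delta := randomizationFactor * float64(base)
	return time.Duration(float64(base) - delta + 2*delta*rng.Float64())
}

func main() {
	rng := rand.New(rand.NewSource(1)) // fixed seed so the example is reproducible
	backoff := 50 * time.Millisecond
	for attempt := 1; attempt <= 5; attempt++ {
		fmt.Printf("attempt %d: sleep ~%v\n", attempt, jitteredBackoff(backoff, 0.5, rng))
		backoff *= 2 // exponential growth between attempts, as with the default retry options
	}
}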

