storage: retry splits more
In tpccbench/nodes=9/cpu=4/multi-region, I consistently see the RESTORE
fail a split because the retry loop in executeAdminCommandWithDescriptor
interleaves with replicate queue activity until the retries are
exhausted. This is likely because, due to the geo-replicated nature of the
cluster, the many steps involved in replicating the range take much
longer than we're used to; in the logs I can see the split attempt
and an interleaving replication change taking turns until the split
gives up. The solution is to retry the split indefinitely, as long as
we only see errors that we know should eventually resolve.
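
To make the intended behavior concrete, here is a minimal standalone sketch of that retry policy (illustrative only, not the CockroachDB implementation; names such as retrySplitForever, runSplitAttempt, and errDescriptorChanged are invented for the example): keep retrying while the failure is of a kind that an interleaving replication change can cause, give up immediately on anything else, and let context cancellation end the loop.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errDescriptorChanged stands in for the kinds of errors (descriptor CPut
// failures, ambiguous results) that an interleaving replication change can
// cause and that are expected to resolve on their own.
var errDescriptorChanged = errors.New("range descriptor changed by concurrent replication")

func isExpectedToResolve(err error) bool {
	return errors.Is(err, errDescriptorChanged)
}

// retrySplitForever retries runSplitAttempt with capped exponential backoff
// until it succeeds, hits an error we don't know how to wait out, or the
// context is canceled. There is deliberately no cap on the attempt count.
func retrySplitForever(ctx context.Context, runSplitAttempt func() error) error {
	backoff := 50 * time.Millisecond
	const maxBackoff = 5 * time.Second
	for {
		err := runSplitAttempt()
		if err == nil || !isExpectedToResolve(err) {
			return err // success, or an error we don't know how to wait out
		}
		select {
		case <-time.After(backoff):
		case <-ctx.Done():
			return ctx.Err()
		}
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}

func main() {
	attempts := 0
	err := retrySplitForever(context.Background(), func() error {
		attempts++
		if attempts < 4 {
			return errDescriptorChanged // simulate interleaving replication changes
		}
		return nil
	})
	fmt.Println(attempts, err) // 4 <nil>
}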

The retry limit was introduced to fix cockroachdb#23310, but there is a lot
of FUD around it. It's very likely that the problem encountered there
was really fixed in cockroachdb#23762, which added a lease check between the
split attempts.

Touches cockroachdb#41028.

Release justification: fixes roachtest failure, is low risk

Release note: None
tbg committed Oct 8, 2019
1 parent 1c99165 commit 8f2a91e
Showing 2 changed files with 7 additions and 76 deletions.
72 changes: 0 additions & 72 deletions pkg/storage/client_split_test.go
@@ -1857,78 +1857,6 @@ func TestStoreSplitOnRemovedReplica(t *testing.T) {
}
}

// TestStoreSplitFailsAfterMaxRetries prevents regression of #23310. It
// ensures that an AdminSplit attempt will retry a limited number of times
// before returning unsuccessfully.
func TestStoreSplitFailsAfterMaxRetries(t *testing.T) {
	defer leaktest.AfterTest(t)()
	if testing.Short() {
		// As of Nov 2018, this test takes 7s. This is because AdminSplit
		// contains an exponential backoff when performing its retries.
		// We could plumb a retry policy through the testing knobs, but
		// that doesn't seem worth it for such a specific test. Instead,
		// skip this test for the short test suite.
		t.Skip("short")
	}

	leftKey := roachpb.Key("a")
	splitKey := roachpb.Key("b")
	rightKey := roachpb.Key("c")

	var splitAttempts int64
	filter := func(ba roachpb.BatchRequest) *roachpb.Error {
		// Intercept and fail replica 1's attempt to perform the AdminSplit.
		// We detect the split's range descriptor update and return an
		// AmbiguousResultError, which is retried.
		for _, req := range ba.Requests {
			if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok {
				leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey))
				if cput.Key.Equal(leftDescKey) {
					var desc roachpb.RangeDescriptor
					if err := cput.Value.GetProto(&desc); err != nil {
						panic(err)
					}

					if desc.EndKey.Equal(splitKey) {
						atomic.AddInt64(&splitAttempts, 1)
						return roachpb.NewError(&roachpb.AmbiguousResultError{})
					}
				}
			}
		}
		return nil
	}

	var args base.TestClusterArgs
	args.ReplicationMode = base.ReplicationManual
	args.ServerArgs.Knobs.Store = &storage.StoreTestingKnobs{
		TestingRequestFilter: filter,
	}

	tc := testcluster.StartTestCluster(t, 1, args)
	defer tc.Stopper().Stop(context.TODO())

	// Split the data range, mainly to avoid other splits getting in our way.
	for _, k := range []roachpb.Key{leftKey, rightKey} {
		if _, _, err := tc.SplitRange(k); err != nil {
			t.Fatal(errors.Wrapf(err, "split at %s", k))
		}
	}

	// Send an AdminSplit request to the replica. In the filter above we'll
	// continue to return AmbiguousResultErrors. The split retry loop should
	// retry a few times before exiting unsuccessfully.
	_, _, err := tc.SplitRange(splitKey)
	if !testutils.IsError(err, "split at key .* failed: result is ambiguous") {
		t.Errorf("unexpected error from SplitRange: %+v", err)
	}

	const expAttempts = 11 // 10 retries
	if splitAttempts != expAttempts {
		t.Errorf("expected %d split attempts, found %d", expAttempts, splitAttempts)
	}
}

func TestStoreSplitGCThreshold(t *testing.T) {
	defer leaktest.AfterTest(t)()
	storeCfg := storage.TestStoreConfig(nil)
11 changes: 7 additions & 4 deletions pkg/storage/replica_command.go
@@ -516,9 +516,13 @@ func (r *Replica) adminUnsplitWithDescriptor(
func (r *Replica) executeAdminCommandWithDescriptor(
	ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
) *roachpb.Error {
	// Retry forever as long as we see errors we know will resolve.
	retryOpts := base.DefaultRetryOptions()
	retryOpts.MaxRetries = 10
	var lastErr error
	// Randomize quite a lot just in case someone else also interferes with us
	// in a retry loop. Note that this is speculative; there wasn't an incident
	// that suggested this.
	retryOpts.RandomizationFactor = 0.5
	lastErr := ctx.Err()
	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
		// The replica may have been destroyed since the start of the retry loop.
		// We need to explicitly check this condition. Having a valid lease, as we
@@ -552,10 +556,9 @@ func (r *Replica) executeAdminCommandWithDescriptor(
				return false
			}
		}); !retry {
			return roachpb.NewError(lastErr)
			break
		}
	}
	// If we broke out of the loop after MaxRetries, return the last error.
	return roachpb.NewError(lastErr)
}
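
Beyond dropping the retry cap, the hunk above widens the backoff jitter via RandomizationFactor = 0.5. As a rough standalone illustration of what that means (a sketch of the usual jittered-backoff convention, not the util/retry implementation itself; jitteredBackoff is an invented helper): each sleep is drawn roughly uniformly from 0.5x to 1.5x of the nominal backoff, so two retry loops that collide once quickly drift apart instead of colliding on every subsequent attempt.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// jitteredBackoff perturbs the nominal backoff by +/- randomizationFactor,
// i.e. it returns a duration uniform in [(1-f)*base, (1+f)*base].
func jitteredBackoff(base time.Duration, randomizationFactor float64, rng *rand.Rand) time.Duration {
	delta := randomizationFactor * float64(base)
	return time.Duration(float64(base) - delta + 2*delta*rng.Float64())
}

func main() {
	rng := rand.New(rand.NewSource(1)) // fixed seed so the example is reproducible
	backoff := 50 * time.Millisecond
	for attempt := 1; attempt <= 5; attempt++ {
		fmt.Printf("attempt %d: sleep ~%v\n", attempt, jitteredBackoff(backoff, 0.5, rng))
		backoff *= 2 // exponential growth between attempts, as with the default retry options
	}
}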

