From 71f0b3466dbbc6f8fc751e846ca85a87da7410da Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 24 Jan 2022 13:34:32 +0100 Subject: [PATCH] kvserver: use Background() in computeChecksumPostApply goroutine On the leaseholder, `ctx` passed to `computeChecksumPostApply` is that of the proposal. As of #71806, this context is canceled right after the corresponding proposal is signaled (and the client goroutine returns from `sendWithRangeID`). This effectively prevents most consistency checks from succeeding (they previously were not affected by higher-level cancellation because the consistency check is triggered from a local queue that talks directly to the replica, i.e. had something like a minutes-long timeout). This caused disastrous behavior in the `clearrange` suite of roachtests. That test imports a large table. After the import, most ranges have estimates (due to the ctx cancellation preventing the consistency checks, which as a byproduct trigger stats adjustments) and their stats claim that they are very small. Before recent PR #74674, `ClearRange` on such ranges would use individual point deletions instead of the much more efficient pebble range deletions, effectively writing a lot of data and running the nodes out of disk. Failures of `clearrange` with #74674 were also observed, but they did not involve out-of-disk situations, so are possibly an alternative failure mode (that may still be related to the newly introduced presence of context cancellation). Touches #68303. Release note: None --- pkg/kv/kvserver/replica_proposal.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go index 8bdd0bce36f4..3179854dd893 100644 --- a/pkg/kv/kvserver/replica_proposal.go +++ b/pkg/kv/kvserver/replica_proposal.go @@ -236,7 +236,9 @@ func (r *Replica) computeChecksumPostApply(ctx context.Context, cc kvserverpb.Co } // Compute SHA asynchronously and store it in a map by UUID. 
- if err := stopper.RunAsyncTask(ctx, "storage.Replica: computing checksum", func(ctx context.Context) { + // Don't use the proposal's context for this, as it is likely to be canceled very + // soon. + if err := stopper.RunAsyncTask(r.AnnotateCtx(context.Background()), "storage.Replica: computing checksum", func(ctx context.Context) { func() { defer snap.Close() var snapshot *roachpb.RaftSnapshotData @@ -504,7 +506,7 @@ func (r *Replica) leasePostApplyLocked( if iAmTheLeaseHolder { // NB: run these in an async task to keep them out of the critical section // (r.mu is held here). - _ = r.store.stopper.RunAsyncTask(ctx, "lease-triggers", func(ctx context.Context) { + _ = r.store.stopper.RunAsyncTask(r.AnnotateCtx(context.Background()), "lease-triggers", func(ctx context.Context) { // Re-acquire the raftMu, as we are now in an async task. r.raftMu.Lock() defer r.raftMu.Unlock()