Skip to content

Commit

Permalink
Merge #110078
Browse files Browse the repository at this point in the history
110078: kvserver: propagate GCHint for partial deletions r=erikgrinaker a=pavelkalinnikov

Previously, `GCHint` was used for 3 purposes:
1. Instruct GC to run at specific times. This is used by SQL table/index drop jobs, to limit the amount of time they wait on data GC.
2. Hint GC that a range is fully covered by range tombstones, so that GC can use `ClearRange` for faster deletes. This is useful for bulk data deletions as well.
3. Hint GC that a range was fully covered by range tombstones at some recent point. GC prioritizes going through such ranges, and processes them together, because this makes Pebble compaction more efficient.

The use-case 1 was broken. If a range tombstone does not fully cover the `Range` keyspace, `GCHint` is not written. Correspondingly, for small table/index deletions (spanning less than a single range), or deletions that don't perfectly match range bounds, there will be some ranges with no hints. For these ranges, GC might be arbitrarily delayed, and the schema change jobs would be stuck.

In addition, `GCHint` propagation during merges was lossy. It could drop the hint if either LHS or RHS doesn't have a hint, or has some data.

This commit extends `GCHint` to fix the use-case 1. Two fields are added: `min` and `max` deletion timestamp. These fields are populated even for partial range deletions, and are propagated during splits and merges unconditionally. The choice of min-max approach is explained in the comments to the corresponding `GCHint` fields.

Fixes #107931

Release note (bug fix): Fixed a bug that could occasionally cause schema change jobs (e.g. table/index drops) to appear stuck in state "waiting for MVCC GC" for much longer than expected. The fix only applies to future schema changes -- existing stuck jobs can be processed by manually force-enqueueing the relevant ranges in the MVCC GC queue under the DB Console's advanced debug page.

Epic: none

Co-authored-by: Pavel Kalinnikov <[email protected]>
  • Loading branch information
craig[bot] and pav-kv committed Sep 14, 2023
2 parents 82ea947 + e28e450 commit 4518a99
Show file tree
Hide file tree
Showing 9 changed files with 497 additions and 87 deletions.
7 changes: 4 additions & 3 deletions pkg/kv/kvpb/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,10 @@ message DeleteRangeRequest {
// considers empty spans equivalent to being covered by an MVCC range
// tombstone, so it will omit the write across an entirely empty span too.
bool idempotent_tombstone = 7;
// If enabled and DeleteRangeRequest is deleting the whole range with a range
// tombstone, GCHint on replica will be updated to notify mvcc gc queue
// that this range would benefit from optimized GC process.
// If enabled, and a span is deleted using a range tombstone, then the GCHint
// on the corresponding Range will be updated. The hint instructs MVCC GC
// queue to delete this data as soon as it can, and helps optimizing GC for
// bulk deletions.
bool update_range_delete_gc_hint = 8 [(gogoproto.customname) = "UpdateRangeDeleteGCHint"];

DeleteRangePredicates predicates = 6 [(gogoproto.nullable) = false];
Expand Down
19 changes: 11 additions & 8 deletions pkg/kv/kvserver/batcheval/cmd_delete_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,24 @@ func DeleteRange(
if !args.UpdateRangeDeleteGCHint {
return nil
}
// If GCHint was provided, then we need to check if this request meets
// range gc criteria of removing all data. This is not an error as range
// might have merged since request was sent and we don't want to fail
// deletion.
if !args.Key.Equal(desc.StartKey.AsRawKey()) || !args.EndKey.Equal(desc.EndKey.AsRawKey()) {
return nil
}
sl := MakeStateLoader(cArgs.EvalCtx)
hint, err := sl.LoadGCHint(ctx, readWriter)
if err != nil {
return err
}
if !hint.ForwardLatestRangeDeleteTimestamp(h.Timestamp) {

// Add the timestamp to GCHint to guarantee that GC eventually clears it.
updated := hint.ScheduleGCFor(h.Timestamp)
// If the range tombstone covers the whole Range key span, update the
// corresponding timestamp in GCHint to enable ClearRange optimization.
if args.Key.Equal(desc.StartKey.AsRawKey()) && args.EndKey.Equal(desc.EndKey.AsRawKey()) {
// NB: don't swap the order, we want to call the method unconditionally.
updated = hint.ForwardLatestRangeDeleteTimestamp(h.Timestamp) || updated
}
if !updated {
return nil
}

if updated, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil || !updated {
return err
}
Expand Down
21 changes: 9 additions & 12 deletions pkg/kv/kvserver/batcheval/cmd_gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,18 +277,15 @@ func GC(
if err != nil {
return result.Result{}, err
}
if !hint.IsEmpty() {
if hint.LatestRangeDeleteTimestamp.LessEq(gcThreshold) {
hint.ResetLatestRangeDeleteTimestamp()
// NB: Replicated.State can already contain GCThreshold from above. Make
// sure we don't accidentally remove it.
if res.Replicated.State == nil {
res.Replicated.State = &kvserverpb.ReplicaState{}
}
res.Replicated.State.GCHint = hint
if _, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil {
return result.Result{}, err
}
if hint.UpdateAfterGC(gcThreshold) {
// NB: Replicated.State can already contain GCThreshold from above. Make
// sure we don't accidentally remove it.
if res.Replicated.State == nil {
res.Replicated.State = &kvserverpb.ReplicaState{}
}
res.Replicated.State.GCHint = hint
if _, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil {
return result.Result{}, err
}
}
}
Expand Down
106 changes: 68 additions & 38 deletions pkg/kv/kvserver/client_merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5251,42 +5251,70 @@ func TestStoreMergeGCHint(t *testing.T) {
name string
dataLeft, dataRight bool
delLeft, delRight bool
expectHint bool
}{
{
name: "merge similar ranges",
dataLeft: true,
dataRight: true,
delLeft: true,
delRight: true,
expectHint: true,
},
{
name: "merge with data on right",
dataLeft: true,
dataRight: true,
delLeft: true,
expectHint: false,
},
{
name: "merge with data on left",
dataLeft: true,
dataRight: true,
delRight: true,
expectHint: false,
},
{
name: "merge with empty on left",
dataRight: true,
delRight: true,
expectHint: true,
},
{
name: "merge with empty on right",
dataLeft: true,
delLeft: true,
expectHint: true,
},

wantRangeDelete bool // the hint must indicate a whole range deletion
wantGCTimestamp bool // the hint must indicate a pending GC timestamp
}{{
name: "merge empty ranges",
dataLeft: false, dataRight: false,
delLeft: false, delRight: false,
wantRangeDelete: false,
wantGCTimestamp: false,
}, {
name: "merge after deleting empty left",
dataLeft: false, dataRight: false,
delLeft: true, delRight: false,
wantRangeDelete: true,
wantGCTimestamp: true,
}, {
name: "merge after deleting empty right",
dataLeft: false, dataRight: false,
delLeft: false, delRight: true,
wantRangeDelete: true,
wantGCTimestamp: true,
}, {
name: "merge after deleting both empty",
dataLeft: false, dataRight: false,
delLeft: true, delRight: true,
wantRangeDelete: true,
wantGCTimestamp: true,
}, {
name: "merge similar ranges",
dataLeft: true, dataRight: true,
delLeft: true, delRight: true,
wantRangeDelete: true,
wantGCTimestamp: true,
}, {
name: "merge with data on right",
dataLeft: true, dataRight: true,
delLeft: true, delRight: false,
wantRangeDelete: false,
wantGCTimestamp: true,
}, {
name: "merge with data on left",
dataLeft: true, dataRight: true,
delLeft: false, delRight: true,
wantRangeDelete: false,
wantGCTimestamp: true,
}, {
name: "merge with no deletes",
dataLeft: true, dataRight: true,
delLeft: false, delRight: false,
wantRangeDelete: false,
wantGCTimestamp: false,
}, {
name: "merge with empty on left",
dataLeft: false, dataRight: true,
delLeft: false, delRight: true,
wantRangeDelete: true,
wantGCTimestamp: true,
}, {
name: "merge with empty on right",
dataLeft: true, dataRight: false,
delLeft: true, delRight: false,
wantRangeDelete: true,
wantGCTimestamp: true,
},
} {
t.Run(d.name, func(t *testing.T) {
ctx := context.Background()
Expand Down Expand Up @@ -5358,8 +5386,10 @@ func TestStoreMergeGCHint(t *testing.T) {

repl := store.LookupReplica(roachpb.RKey(leftKey))
gcHint := repl.GetGCHint()
require.Equal(t, d.expectHint, !gcHint.IsEmpty(), "GC hint is empty after range delete")
if d.expectHint && d.delLeft && d.delRight {
require.Equal(t, d.wantRangeDelete, gcHint.LatestRangeDeleteTimestamp.IsSet())
require.Equal(t, d.wantGCTimestamp, gcHint.GCTimestamp.IsSet())

if d.wantRangeDelete && d.delLeft && d.delRight {
require.Greater(t, gcHint.LatestRangeDeleteTimestamp.WallTime, beforeSecondDel,
"highest timestamp wasn't picked up")
}
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/client_protectedts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
// It works by writing a lot of data and waiting for the GC heuristic to allow
// for GC. Because of this, it's very slow and expensive. It should
// potentially be made cheaper by injecting hooks to force GC.
// TODO(pavelkalinnikov): use the GCHint for this.
//
// Probably this test should always be skipped until it is made cheaper,
// nevertheless it's a useful test.
Expand Down
31 changes: 20 additions & 11 deletions pkg/kv/kvserver/mvcc_gc_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,9 @@ func largeAbortSpan(ms enginepb.MVCCStats) bool {
type mvccGCQueue struct {
*baseQueue

// Set to true when GC finds range that has a hint indicating that range is
// completely cleared.
// lastRangeWasHighPriority is true when GC found a range with a GCHint
// indicating that the range is completely cleared. Reset to false after all
// such ranges have been processed.
lastRangeWasHighPriority bool
// leaseholderCheckInterceptor is a leasholder check used by high priority replica scanner
// its only purpose is to allow test function injection.
Expand Down Expand Up @@ -517,10 +518,20 @@ func makeMVCCGCQueueScoreImpl(
r.FinalScore++
}

maybeRangeDel := suspectedFullRangeDeletion(ms)
hasActiveGCHint := gcHintedRangeDelete(hint, gcTTL, now)
// Check if there is a pending GC run according to the GCHint.
if canGC(hint.GCTimestamp, gcTTL, now) {
// TODO(pavelkalinnikov): using canAdvanceGCThreshold makes the best sense
// now, however we should probably check a stronger condition that the GC
// threshold can be advanced to timestamp >= hint.GCTimestamp. The current
// way can result in false positives and wasteful GC runs.
r.ShouldQueue = canAdvanceGCThreshold
}

if hasActiveGCHint && (maybeRangeDel || ms.ContainsEstimates > 0) {
// Check if GC should be run because the entire keyspace is likely covered by
// MVCC range tombstones.
rangeDeleteHint := canGC(hint.LatestRangeDeleteTimestamp, gcTTL, now)
maybeRangeDel := suspectedFullRangeDeletion(ms)
if rangeDeleteHint && (maybeRangeDel || ms.ContainsEstimates > 0) {
// We have GC hint allowing us to collect range and we either satisfy
// heuristic that indicate no live data or we have estimates and we assume
// hint is correct.
Expand All @@ -537,12 +548,10 @@ func makeMVCCGCQueueScoreImpl(
return r
}

func gcHintedRangeDelete(hint roachpb.GCHint, ttl time.Duration, now hlc.Timestamp) bool {
deleteTimestamp := hint.LatestRangeDeleteTimestamp
if deleteTimestamp.IsEmpty() {
return false
}
return deleteTimestamp.Add(ttl.Nanoseconds(), 0).Less(now)
// canGC returns true iff the given timestamp can be garbage-collected at the
// given moment in time, provided the configured data TTL.
func canGC(t hlc.Timestamp, ttl time.Duration, now hlc.Timestamp) bool {
return t.IsSet() && t.Add(ttl.Nanoseconds(), 0).Less(now)
}

// suspectedFullRangeDeletion checks for ranges where there's no live data and
Expand Down
92 changes: 79 additions & 13 deletions pkg/roachpb/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -916,24 +916,41 @@ func (l Locality) AddTier(tier Tier) Locality {

// IsEmpty returns true if hint contains no data.
func (h *GCHint) IsEmpty() bool {
return h.LatestRangeDeleteTimestamp.IsEmpty()
return h.LatestRangeDeleteTimestamp.IsEmpty() && h.GCTimestamp.IsEmpty()
}

// Merge combines GC hints of two ranges. The result is either a hint that
// covers both ranges or empty hint if it is not possible to merge hints.
// leftEmpty and rightEmpty arguments are set based on MVCCStats.HasNoUserData
// of receiver hint (leftEmpty) and argument hint (rightEmpty).
// Returns true if receiver state was changed.
// Merge combines GC hints of two adjacent ranges. Updates the receiver to be a
// GCHint that covers both ranges, and so can be carried by the merged range.
// Returns true iff the receiver was updated.
//
// The leftEmpty and rightEmpty arguments correspond to MVCCStats.HasNoUserData
// of the receiver hint and the argument RHS hint respectively. These are used
// by a heuristic, to stop carrying the LatestRangeDeleteTimestamp field of the
// hint it the range is likely not covered by MVCC range tombstones (anymore).
//
// Merge is commutative in a sense that the merged hint does not change if the
// order of the LHS and RHS hints (and leftEmpty/rightEmpty, correspondingly) is
// swapped.
func (h *GCHint) Merge(rhs *GCHint, leftEmpty, rightEmpty bool) bool {
// If either side has data but no hint, merged range can't have a hint.
updated := h.ScheduleGCFor(rhs.GCTimestamp)
// NB: don't swap the operands, we need the side effect of the method call.
updated = h.ScheduleGCFor(rhs.GCTimestampNext) || updated

// If LHS or RHS has data but no LatestRangeDeleteTimestamp hint, then this
// side is not known to be covered by range tombstones. Correspondingly, the
// union of the two is not too. If so, clear the hint.
if (rhs.LatestRangeDeleteTimestamp.IsEmpty() && !rightEmpty) ||
(h.LatestRangeDeleteTimestamp.IsEmpty() && !leftEmpty) {
updated := h.LatestRangeDeleteTimestamp.IsSet()
updated = updated || h.LatestRangeDeleteTimestamp.IsSet()
h.LatestRangeDeleteTimestamp = hlc.Timestamp{}
return updated
}
// Otherwise, use the newest hint.
return h.ForwardLatestRangeDeleteTimestamp(rhs.LatestRangeDeleteTimestamp)
// TODO(pavelkalinnikov): handle the case when some side has a hint (i.e. is
// covered by range tombstones), but is not empty. It means that there is data
// on top of the range tombstones, so the ClearRange optimization may not be
// effective. For now, live with the false positive because this is unlikely.

return h.ForwardLatestRangeDeleteTimestamp(rhs.LatestRangeDeleteTimestamp) || updated
}

// ForwardLatestRangeDeleteTimestamp bumps LatestDeleteRangeTimestamp in GC hint
Expand All @@ -946,7 +963,56 @@ func (h *GCHint) ForwardLatestRangeDeleteTimestamp(ts hlc.Timestamp) bool {
return false
}

// ResetLatestRangeDeleteTimestamp resets delete range timestamp.
func (h *GCHint) ResetLatestRangeDeleteTimestamp() {
h.LatestRangeDeleteTimestamp = hlc.Timestamp{}
// ScheduleGCFor updates the hint to schedule eager GC for data up to the given
// timestamp. When this timestamp falls below the GC threshold/TTL, it will be
// eagerly enqueued for GC when considered by the MVCC GC queue.
//
// Returns true iff the hint was updated.
func (h *GCHint) ScheduleGCFor(ts hlc.Timestamp) bool {
if ts.IsEmpty() {
return false
}
if h.GCTimestamp.IsEmpty() {
h.GCTimestamp = ts
} else if cmp := h.GCTimestamp.Compare(ts); cmp > 0 {
if h.GCTimestampNext.IsEmpty() {
h.GCTimestampNext = h.GCTimestamp
}
h.GCTimestamp = ts
} else if cmp == 0 {
return false
} else if h.GCTimestampNext.IsEmpty() {
h.GCTimestampNext = ts
} else if ts.LessEq(h.GCTimestampNext) {
return false
} else {
h.GCTimestampNext = ts
}
return true
}

// UpdateAfterGC updates the GCHint according to the threshold, up to which the
// data has been garbage collected. Returns true iff the hint has been updated.
func (h *GCHint) UpdateAfterGC(gcThreshold hlc.Timestamp) bool {
updated := h.advanceGCTimestamp(gcThreshold)
if t := h.LatestRangeDeleteTimestamp; t.IsSet() && t.LessEq(gcThreshold) {
h.LatestRangeDeleteTimestamp = hlc.Timestamp{}
return true
}
return updated
}

func (h *GCHint) advanceGCTimestamp(gcThreshold hlc.Timestamp) bool {
// If GC threshold is below the minimum, leave the hint intact.
if t := h.GCTimestamp; t.IsEmpty() || gcThreshold.Less(t) {
return false
}
// If min <= threshold < max, erase the min and set it to match the max.
if t := h.GCTimestampNext; t.IsEmpty() || gcThreshold.Less(t) {
h.GCTimestamp, h.GCTimestampNext = h.GCTimestampNext, hlc.Timestamp{}
return true
}
// If threshold >= max, erase both min and max.
h.GCTimestamp, h.GCTimestampNext = hlc.Timestamp{}, hlc.Timestamp{}
return true
}
Loading

0 comments on commit 4518a99

Please sign in to comment.