From b5f5b233dd36e16601014439dac9b00db8b96674 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 7 Sep 2023 23:34:31 +0100 Subject: [PATCH 1/9] kvserver: document the usage of GCHint The GCHint is currently used for 3 purposes. Document the status quo before making changes. Epic: none Release note: none --- pkg/kv/kvpb/api.proto | 7 +++--- pkg/kv/kvserver/mvcc_gc_queue.go | 5 +++-- pkg/roachpb/metadata.go | 9 +++++++- pkg/roachpb/metadata.proto | 38 ++++++++++++++++++++++++++++++-- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto index 4375ab8d9f16..fe9fbd99ad96 100644 --- a/pkg/kv/kvpb/api.proto +++ b/pkg/kv/kvpb/api.proto @@ -457,9 +457,10 @@ message DeleteRangeRequest { // considers empty spans equivalent to being covered by an MVCC range // tombstone, so it will omit the write across an entirely empty span too. bool idempotent_tombstone = 7; - // If enabled and DeleteRangeRequest is deleting the whole range with a range - // tombstone, GCHint on replica will be updated to notify mvcc gc queue - // that this range would benefit from optimized GC process. + // If enabled, and a span is deleted using a range tombstone, then the GCHint + // on the corresponding Range will be updated. The hint instructs MVCC GC + // queue to delete this data as soon as it can, and helps optimizing GC for + // bulk deletions. bool update_range_delete_gc_hint = 8 [(gogoproto.customname) = "UpdateRangeDeleteGCHint"]; DeleteRangePredicates predicates = 6 [(gogoproto.nullable) = false]; diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go index cc191b10e63c..b3c6dd56fa9b 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue.go +++ b/pkg/kv/kvserver/mvcc_gc_queue.go @@ -158,8 +158,9 @@ func largeAbortSpan(ms enginepb.MVCCStats) bool { type mvccGCQueue struct { *baseQueue - // Set to true when GC finds range that has a hint indicating that range is - // completely cleared. + // lastRangeWasHighPriority is true when GC found a range with a GCHint + // indicating that the range is completely cleared. Reset to false after all + // such ranges have been processed. lastRangeWasHighPriority bool // leaseholderCheckInterceptor is a leasholder check used by high priority replica scanner // its only purpose is to allow test function injection. diff --git a/pkg/roachpb/metadata.go b/pkg/roachpb/metadata.go index 91d2b1427c19..4425463d7abe 100644 --- a/pkg/roachpb/metadata.go +++ b/pkg/roachpb/metadata.go @@ -925,13 +925,20 @@ func (h *GCHint) IsEmpty() bool { // of receiver hint (leftEmpty) and argument hint (rightEmpty). // Returns true if receiver state was changed. func (h *GCHint) Merge(rhs *GCHint, leftEmpty, rightEmpty bool) bool { - // If either side has data but no hint, merged range can't have a hint. + // If LHS or RHS has data but no LatestRangeDeleteTimestamp hint, then this + // side is not known to be covered by range tombstones. Correspondingly, the + // union of the two is not too. If so, clear the hint. if (rhs.LatestRangeDeleteTimestamp.IsEmpty() && !rightEmpty) || (h.LatestRangeDeleteTimestamp.IsEmpty() && !leftEmpty) { updated := h.LatestRangeDeleteTimestamp.IsSet() h.LatestRangeDeleteTimestamp = hlc.Timestamp{} return updated } + // TODO(pavelkalinnikov): handle the case when some side has a hint (i.e. is + // covered by range tombstones), but is not empty. It means that there is data + // on top of the range tombstones, so the ClearRange optimization may not be + // effective. For now, live with the false positive because this is unlikely. + // Otherwise, use the newest hint. return h.ForwardLatestRangeDeleteTimestamp(rhs.LatestRangeDeleteTimestamp) } diff --git a/pkg/roachpb/metadata.proto b/pkg/roachpb/metadata.proto index 6bf392f31e8e..939b219f9b8e 100644 --- a/pkg/roachpb/metadata.proto +++ b/pkg/roachpb/metadata.proto @@ -463,7 +463,41 @@ message Version { message GCHint { option (gogoproto.equal) = true; - // LatestRangeDeleteTimestamp indicates timestamp at which all data was deleted - // in the range using a range tombstone. No such delete happened if empty. + // LatestRangeDeleteTimestamp is a timestamp at which the *entire* key span of + // the range was deleted using MVCC range tombstones. This hint is maintained + // with best effort, and may have false positives and negatives. + // + // It helps the cases when large key spans are deleted, e.g. table/index drops + // in SQL. In such cases it is likely that there will be no writes to the + // keyspace of this Range at timestamps above LatestRangeDeleteTimestamp. GC + // can then delete this data using the efficient ClearRange operation. + // + // Generally, we can't guarantee no writes above LatestRangeDeleteTimestamp, + // so we don't rely on this hint for correctness. When GC uses this hint, it + // double-checks while holding a latch that the range hasn't had any writes + // above this timestamp. + // + // Initially, when a large key span is deleted, this hint is accurate for all + // the affected ranges. As the time goes, more data can be written to these + // ranges, or they may be split/merged with other ranges. We do best effort to + // remove this hint as soon as we know the optimization won't be effective. + // + // For example, we don't propagate this hint when merging two ranges one of + // which doesn't carry this hint, because we know that the entire span is + // probably not covered by range tombstones. + // + // TODO(pavelkalinnikov): this hint is also used for two other purposes, + // factor them out. + // + // - Upper layers can set this hint if they want to expedite GC process for + // this range. For example, on deleting a SQL table/index, the schema change + // job is blocked until GC removes the data entirely, so it sets the hint. + // This case is buggy, see #107931. + // + // - When this hint is set, it is likely that adjacent ranges have it too, + // especially in bulk deletion situations. Once the GC queue sees this hint, + // it scans meta to find all the ranges that have it set too, and GC them + // with higher priority / lower delay. Ultimately, this reduces the cost of + // Pebble compactions that can be triggered by writing range tombstones. optional util.hlc.Timestamp latest_range_delete_timestamp = 1 [(gogoproto.nullable) = false]; } From 5199815863333ae53fe647667daa40d428ae5d01 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 7 Sep 2023 23:40:48 +0100 Subject: [PATCH 2/9] kvserver: add GCHint.UpdateAfterGC method This is a cosmetic commit. It replaces a lower-level GCHint "reset" method with a method that takes an effective GC threshold. Epic: none Release note: none --- pkg/kv/kvserver/batcheval/cmd_gc.go | 21 +++++++++------------ pkg/roachpb/metadata.go | 11 ++++++++--- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go index b9140ff056b1..1e48463bb6c9 100644 --- a/pkg/kv/kvserver/batcheval/cmd_gc.go +++ b/pkg/kv/kvserver/batcheval/cmd_gc.go @@ -277,18 +277,15 @@ func GC( if err != nil { return result.Result{}, err } - if !hint.IsEmpty() { - if hint.LatestRangeDeleteTimestamp.LessEq(gcThreshold) { - hint.ResetLatestRangeDeleteTimestamp() - // NB: Replicated.State can already contain GCThreshold from above. Make - // sure we don't accidentally remove it. - if res.Replicated.State == nil { - res.Replicated.State = &kvserverpb.ReplicaState{} - } - res.Replicated.State.GCHint = hint - if _, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil { - return result.Result{}, err - } + if hint.UpdateAfterGC(gcThreshold) { + // NB: Replicated.State can already contain GCThreshold from above. Make + // sure we don't accidentally remove it. + if res.Replicated.State == nil { + res.Replicated.State = &kvserverpb.ReplicaState{} + } + res.Replicated.State.GCHint = hint + if _, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil { + return result.Result{}, err } } } diff --git a/pkg/roachpb/metadata.go b/pkg/roachpb/metadata.go index 4425463d7abe..312d55f72562 100644 --- a/pkg/roachpb/metadata.go +++ b/pkg/roachpb/metadata.go @@ -953,7 +953,12 @@ func (h *GCHint) ForwardLatestRangeDeleteTimestamp(ts hlc.Timestamp) bool { return false } -// ResetLatestRangeDeleteTimestamp resets delete range timestamp. -func (h *GCHint) ResetLatestRangeDeleteTimestamp() { - h.LatestRangeDeleteTimestamp = hlc.Timestamp{} +// UpdateAfterGC updates the GCHint according to the threshold, up to which the +// data has been garbage collected. Returns true iff the hint has been updated. +func (h *GCHint) UpdateAfterGC(gcThreshold hlc.Timestamp) bool { + if t := h.LatestRangeDeleteTimestamp; t.IsSet() && t.LessEq(gcThreshold) { + h.LatestRangeDeleteTimestamp = hlc.Timestamp{} + return true + } + return false } From 5f5ee8d99b56e14dbb8e748164a0ab2cbbf9ccc4 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Tue, 5 Sep 2023 21:15:23 +0100 Subject: [PATCH 3/9] kvserver: propagate GCHint for partial deletions Previously, GCHint was used for 3 purposes: 1. Instruct GC to run at specific times. This is used by SQL table/index drop jobs, to limit the amount of time they wait on data GC. 2. Hint GC that a range is fully covered by range tombstones, so that GC can use ClearRange for faster deletes. This is useful for bulk data deletions as well. 3. Hint GC that a range was fully covered by range tombstones at some recent point. GC prioritizes going through such ranges, and processes them together, because this makes Pebble compaction more efficient. The use-case 1 was broken. If a range tombstone does not fully cover the Range keyspace, GCHint is not written. Correspondingly, for small table/index deletions (spanning less than a single range), or deletions that don't perfectly match range bounds, there will be some ranges with no hints. For these ranges, GC might be arbitrarily delayed, and the schema change jobs would be stuck. In addition, GCHint propagation during merges was lossy. It could drop the hint if either LHS or RHS doesn't have a hint, or has some data. This commit extends GCHint to fix the use-case 1. Two fields are added: min and max deletion timestamp. These fields are populated even for partial range deletions, and are propagated during splits and merges unconditionally. The choice of min-max approach is explained in the comments to the corresponding GCHint fields. Release note (bug fix): Fixed a bug that could occasionally cause schema change jobs (e.g. table/index drops) to appear stuck in state "waiting for MVCC GC" for much longer than expected. The fix only applies to future schema changes -- existing stuck jobs can be processed by manually force-enqueueing the relevant ranges in the MVCC GC queue under the DB Console's advanced debug page. Epic: none --- pkg/kv/kvserver/batcheval/cmd_delete_range.go | 19 +++-- pkg/kv/kvserver/client_merge_test.go | 68 +++++++++------- pkg/kv/kvserver/client_protectedts_test.go | 1 + pkg/kv/kvserver/mvcc_gc_queue.go | 26 ++++--- pkg/roachpb/metadata.go | 74 +++++++++++++++--- pkg/roachpb/metadata.proto | 77 ++++++++++++++++--- 6 files changed, 199 insertions(+), 66 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go index 969918e53bbd..8158507d0c3f 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go @@ -123,21 +123,24 @@ func DeleteRange( if !args.UpdateRangeDeleteGCHint { return nil } - // If GCHint was provided, then we need to check if this request meets - // range gc criteria of removing all data. This is not an error as range - // might have merged since request was sent and we don't want to fail - // deletion. - if !args.Key.Equal(desc.StartKey.AsRawKey()) || !args.EndKey.Equal(desc.EndKey.AsRawKey()) { - return nil - } sl := MakeStateLoader(cArgs.EvalCtx) hint, err := sl.LoadGCHint(ctx, readWriter) if err != nil { return err } - if !hint.ForwardLatestRangeDeleteTimestamp(h.Timestamp) { + + // Add the timestamp to GCHint to guarantee that GC eventually clears it. + updated := hint.ScheduleGCFor(h.Timestamp) + // If the range tombstone covers the whole Range key span, update the + // corresponding timestamp in GCHint to enable ClearRange optimization. + if args.Key.Equal(desc.StartKey.AsRawKey()) && args.EndKey.Equal(desc.EndKey.AsRawKey()) { + // NB: don't swap the order, we want to call the method unconditionally. + updated = hint.ForwardLatestRangeDeleteTimestamp(h.Timestamp) || updated + } + if !updated { return nil } + if updated, err := sl.SetGCHint(ctx, readWriter, cArgs.Stats, hint); err != nil || !updated { return err } diff --git a/pkg/kv/kvserver/client_merge_test.go b/pkg/kv/kvserver/client_merge_test.go index dc5f76fe1db6..8cd37c5ac6a2 100644 --- a/pkg/kv/kvserver/client_merge_test.go +++ b/pkg/kv/kvserver/client_merge_test.go @@ -5251,41 +5251,53 @@ func TestStoreMergeGCHint(t *testing.T) { name string dataLeft, dataRight bool delLeft, delRight bool - expectHint bool + + wantRangeDelete bool // the hint must indicate a whole range deletion + wantGCTimestamp bool // the hint must indicate a pending GC timestamp }{ { - name: "merge similar ranges", - dataLeft: true, - dataRight: true, - delLeft: true, - delRight: true, - expectHint: true, + name: "merge similar ranges", + dataLeft: true, + dataRight: true, + delLeft: true, + delRight: true, + + wantRangeDelete: true, + wantGCTimestamp: true, }, { - name: "merge with data on right", - dataLeft: true, - dataRight: true, - delLeft: true, - expectHint: false, + name: "merge with data on right", + dataLeft: true, + dataRight: true, + delLeft: true, + + wantRangeDelete: false, + wantGCTimestamp: true, }, { - name: "merge with data on left", - dataLeft: true, - dataRight: true, - delRight: true, - expectHint: false, + name: "merge with data on left", + dataLeft: true, + dataRight: true, + delRight: true, + + wantRangeDelete: false, + wantGCTimestamp: true, }, { - name: "merge with empty on left", - dataRight: true, - delRight: true, - expectHint: true, + name: "merge with empty on left", + dataRight: true, + delRight: true, + + wantRangeDelete: true, + wantGCTimestamp: true, }, { - name: "merge with empty on right", - dataLeft: true, - delLeft: true, - expectHint: true, + name: "merge with empty on right", + dataLeft: true, + delLeft: true, + + wantRangeDelete: true, + wantGCTimestamp: true, }, } { t.Run(d.name, func(t *testing.T) { @@ -5358,8 +5370,10 @@ func TestStoreMergeGCHint(t *testing.T) { repl := store.LookupReplica(roachpb.RKey(leftKey)) gcHint := repl.GetGCHint() - require.Equal(t, d.expectHint, !gcHint.IsEmpty(), "GC hint is empty after range delete") - if d.expectHint && d.delLeft && d.delRight { + require.Equal(t, d.wantRangeDelete, gcHint.LatestRangeDeleteTimestamp.IsSet()) + require.Equal(t, d.wantGCTimestamp, gcHint.GCTimestamp.IsSet()) + + if d.wantRangeDelete && d.delLeft && d.delRight { require.Greater(t, gcHint.LatestRangeDeleteTimestamp.WallTime, beforeSecondDel, "highest timestamp wasn't picked up") } diff --git a/pkg/kv/kvserver/client_protectedts_test.go b/pkg/kv/kvserver/client_protectedts_test.go index 4fb1bfe9dab6..c8aa5d147edc 100644 --- a/pkg/kv/kvserver/client_protectedts_test.go +++ b/pkg/kv/kvserver/client_protectedts_test.go @@ -41,6 +41,7 @@ import ( // It works by writing a lot of data and waiting for the GC heuristic to allow // for GC. Because of this, it's very slow and expensive. It should // potentially be made cheaper by injecting hooks to force GC. +// TODO(pavelkalinnikov): use the GCHint for this. // // Probably this test should always be skipped until it is made cheaper, // nevertheless it's a useful test. diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go index b3c6dd56fa9b..a38cd8952517 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue.go +++ b/pkg/kv/kvserver/mvcc_gc_queue.go @@ -501,10 +501,20 @@ func makeMVCCGCQueueScoreImpl( r.FinalScore++ } - maybeRangeDel := suspectedFullRangeDeletion(ms) - hasActiveGCHint := gcHintedRangeDelete(hint, gcTTL, now) + // Check if there is a pending GC run according to the GCHint. + if canGC(hint.GCTimestamp, gcTTL, now) { + // TODO(pavelkalinnikov): using canAdvanceGCThreshold makes the best sense + // now, however we should probably check a stronger condition that the GC + // threshold can be advanced to timestamp >= hint.GCTimestamp. The current + // way can result in false positives and wasteful GC runs. + r.ShouldQueue = canAdvanceGCThreshold + } - if hasActiveGCHint && (maybeRangeDel || ms.ContainsEstimates > 0) { + // Check if GC should be run because the entire keyspace is likely covered by + // MVCC range tombstones. + rangeDeleteHint := canGC(hint.LatestRangeDeleteTimestamp, gcTTL, now) + maybeRangeDel := suspectedFullRangeDeletion(ms) + if rangeDeleteHint && (maybeRangeDel || ms.ContainsEstimates > 0) { // We have GC hint allowing us to collect range and we either satisfy // heuristic that indicate no live data or we have estimates and we assume // hint is correct. @@ -521,12 +531,10 @@ func makeMVCCGCQueueScoreImpl( return r } -func gcHintedRangeDelete(hint roachpb.GCHint, ttl time.Duration, now hlc.Timestamp) bool { - deleteTimestamp := hint.LatestRangeDeleteTimestamp - if deleteTimestamp.IsEmpty() { - return false - } - return deleteTimestamp.Add(ttl.Nanoseconds(), 0).Less(now) +// canGC returns true iff the given timestamp can be garbage-collected at the +// given moment in time, provided the configured data TTL. +func canGC(t hlc.Timestamp, ttl time.Duration, now hlc.Timestamp) bool { + return t.IsSet() && t.Add(ttl.Nanoseconds(), 0).Less(now) } // suspectedFullRangeDeletion checks for ranges where there's no live data and diff --git a/pkg/roachpb/metadata.go b/pkg/roachpb/metadata.go index 312d55f72562..bc64d70f52f9 100644 --- a/pkg/roachpb/metadata.go +++ b/pkg/roachpb/metadata.go @@ -916,21 +916,32 @@ func (l Locality) AddTier(tier Tier) Locality { // IsEmpty returns true if hint contains no data. func (h *GCHint) IsEmpty() bool { - return h.LatestRangeDeleteTimestamp.IsEmpty() + return h.LatestRangeDeleteTimestamp.IsEmpty() && h.GCTimestamp.IsEmpty() } -// Merge combines GC hints of two ranges. The result is either a hint that -// covers both ranges or empty hint if it is not possible to merge hints. -// leftEmpty and rightEmpty arguments are set based on MVCCStats.HasNoUserData -// of receiver hint (leftEmpty) and argument hint (rightEmpty). -// Returns true if receiver state was changed. +// Merge combines GC hints of two adjacent ranges. Updates the receiver to be a +// GCHint that covers both ranges, and so can be carried by the merged range. +// Returns true iff the receiver was updated. +// +// The leftEmpty and rightEmpty arguments correspond to MVCCStats.HasNoUserData +// of the receiver hint and the argument RHS hint respectively. These are used +// by a heuristic, to stop carrying the LatestRangeDeleteTimestamp field of the +// hint it the range is likely not covered by MVCC range tombstones (anymore). +// +// Merge is commutative in a sense that the merged hint does not change if the +// order of the LHS and RHS hints (and leftEmpty/rightEmpty, correspondingly) is +// swapped. func (h *GCHint) Merge(rhs *GCHint, leftEmpty, rightEmpty bool) bool { + updated := h.ScheduleGCFor(rhs.GCTimestamp) + // NB: don't swap the operands, we need the side effect of the method call. + updated = h.ScheduleGCFor(rhs.GCTimestampNext) || updated + // If LHS or RHS has data but no LatestRangeDeleteTimestamp hint, then this // side is not known to be covered by range tombstones. Correspondingly, the // union of the two is not too. If so, clear the hint. if (rhs.LatestRangeDeleteTimestamp.IsEmpty() && !rightEmpty) || (h.LatestRangeDeleteTimestamp.IsEmpty() && !leftEmpty) { - updated := h.LatestRangeDeleteTimestamp.IsSet() + updated = updated || h.LatestRangeDeleteTimestamp.IsSet() h.LatestRangeDeleteTimestamp = hlc.Timestamp{} return updated } @@ -939,8 +950,7 @@ func (h *GCHint) Merge(rhs *GCHint, leftEmpty, rightEmpty bool) bool { // on top of the range tombstones, so the ClearRange optimization may not be // effective. For now, live with the false positive because this is unlikely. - // Otherwise, use the newest hint. - return h.ForwardLatestRangeDeleteTimestamp(rhs.LatestRangeDeleteTimestamp) + return h.ForwardLatestRangeDeleteTimestamp(rhs.LatestRangeDeleteTimestamp) || updated } // ForwardLatestRangeDeleteTimestamp bumps LatestDeleteRangeTimestamp in GC hint @@ -953,12 +963,56 @@ func (h *GCHint) ForwardLatestRangeDeleteTimestamp(ts hlc.Timestamp) bool { return false } +// ScheduleGCFor updates the hint to schedule eager GC for data up to the given +// timestamp. When this timestamp falls below the GC threshold/TTL, it will be +// eagerly enqueued for GC when considered by the MVCC GC queue. +// +// Returns true iff the hint was updated. +func (h *GCHint) ScheduleGCFor(ts hlc.Timestamp) bool { + if ts.IsEmpty() { + return false + } + if h.GCTimestamp.IsEmpty() { + h.GCTimestamp = ts + } else if cmp := h.GCTimestamp.Compare(ts); cmp > 0 { + if h.GCTimestampNext.IsEmpty() { + h.GCTimestampNext = h.GCTimestamp + } + h.GCTimestamp = ts + } else if cmp == 0 { + return false + } else if h.GCTimestampNext.IsEmpty() { + h.GCTimestampNext = ts + } else if ts.LessEq(h.GCTimestampNext) { + return false + } else { + h.GCTimestampNext = ts + } + return true +} + // UpdateAfterGC updates the GCHint according to the threshold, up to which the // data has been garbage collected. Returns true iff the hint has been updated. func (h *GCHint) UpdateAfterGC(gcThreshold hlc.Timestamp) bool { + updated := h.advanceGCTimestamp(gcThreshold) if t := h.LatestRangeDeleteTimestamp; t.IsSet() && t.LessEq(gcThreshold) { h.LatestRangeDeleteTimestamp = hlc.Timestamp{} return true } - return false + return updated +} + +func (h *GCHint) advanceGCTimestamp(gcThreshold hlc.Timestamp) bool { + // If GC threshold is below the minimum, leave the hint intact. + if t := h.GCTimestamp; t.IsEmpty() || gcThreshold.Less(t) { + return false + } + // If min <= threshold < max, erase the min and set it to match the max. + if t := h.GCTimestampNext; t.IsEmpty() || gcThreshold.Less(t) { + h.GCTimestamp, h.GCTimestampNext = h.GCTimestampNext, hlc.Timestamp{} + return true + } + // If threshold >= max, erase both min and max. + h.GCTimestamp, h.GCTimestampNext = hlc.Timestamp{}, hlc.Timestamp{} + return true } diff --git a/pkg/roachpb/metadata.proto b/pkg/roachpb/metadata.proto index 939b219f9b8e..f529efa3bd5c 100644 --- a/pkg/roachpb/metadata.proto +++ b/pkg/roachpb/metadata.proto @@ -486,18 +486,71 @@ message GCHint { // which doesn't carry this hint, because we know that the entire span is // probably not covered by range tombstones. // - // TODO(pavelkalinnikov): this hint is also used for two other purposes, - // factor them out. + // TODO(pavelkalinnikov): this is used for one other purpose, factor it out. // - // - Upper layers can set this hint if they want to expedite GC process for - // this range. For example, on deleting a SQL table/index, the schema change - // job is blocked until GC removes the data entirely, so it sets the hint. - // This case is buggy, see #107931. - // - // - When this hint is set, it is likely that adjacent ranges have it too, - // especially in bulk deletion situations. Once the GC queue sees this hint, - // it scans meta to find all the ranges that have it set too, and GC them - // with higher priority / lower delay. Ultimately, this reduces the cost of - // Pebble compactions that can be triggered by writing range tombstones. + // When this hint is set, it is likely that adjacent ranges have it too, + // especially in bulk deletion situations. Once the GC queue sees this hint, + // it scans all the local leaseholder replicas to find the one with this hint + // too, and GC them with higher priority / lower delay. Ultimately, this + // reduces the cost of Pebble compactions that can be triggered by writing + // range tombstones. optional util.hlc.Timestamp latest_range_delete_timestamp = 1 [(gogoproto.nullable) = false]; + + // GCTimestamp is the lowest "pending" timestamp for which some data in the + // range must be garbage collected. + // + // Typically, this is left up to the MVCC GC job to decide when to delete + // data. Because of the way the GC scoring works, it may take a long time + // until such a decision is made. Some upper layer jobs require a more timely + // deletion. For example, when a SQL table/index is dropped, there is a schema + // change job waiting on the GC. + // + // We carry this hint to help expedite GC in such cases. Garbage collection is + // triggered shortly after the time is past GCTimestamp + TTL. + // + // It is possible that there are multiple timestamps for which some data must + // be deleted, e.g. if multiple SQL tables touching the same Range are + // dropped. We could carry all such timestamps in this GCHint for accuracy, + // but we would need extra measures to prevent uncontrollable growth of this + // list, and excessive number of GC runs it could cause. + // + // Instead, we compress this list of timestamps to a pair of GCTimestamp and + // GCTimestampNext, storing the min and max (if different from min) timestamp + // correspondingly. + // + // Since a GC run covers all timestamps <= threshold, preserving the max + // timestamp as a threshold guarantees that all pending timestamps <= max are + // covered. However, the max pending timestamp can always be bumped up by + // newer and newer writes, and never get below now() - TTL. To work around + // this race, we also preserve the min pending timestamp, and update it only + // when it has been garbage collected. + // + // With such a structure of the hint, we have a couple of guarantees: + // + // 1. The number of expedited GC runs on this range does not exceed 2 per TTL. + // + // Worst case: min < max < min+TTL = now. GC runs at time now = min+TTL, and + // maybe at max+TTL or later time. Since any new write is at timestamp >= now, + // the next GC run after these two is at least TTL away from now. + // + // 2. The max delay of expedited GC is close to one TTL, i.e. in the worst + // case the GC will run at timestamp + 2*TTL. + // + // Worst case: now = max = min+TTL. GC runs at time now and clears the min + // timestamp. The only remaining timestamp (max) is min+TTL. The next possible + // expedited GC run is thus at min+2*TTL. We may have omitted some timestamps + // between min and max, and they are now <= 2*TTL away from the next GC run. + optional util.hlc.Timestamp gc_timestamp = 2 [(gogoproto.nullable) = false, + (gogoproto.customname) = "GCTimestamp"]; + // GCTimestampNext is the largest "pending" timestamp for which some data in + // the range must be garbage collected. See GCTimestamp comment for more + // details. + // + // INVARIANT: if GCTimestamp is empty, then GCTimestampNext is empty. + // INVARIANT: if GCTimestampNext is set, it exceeds GCTimestamp. + // + // NB: if GCTimestamp is set and GCTimestampNext is empty, it means there is + // only one pending timestamp for garbage collection. + optional util.hlc.Timestamp gc_timestamp_next = 3 [(gogoproto.nullable) = false, + (gogoproto.customname) = "GCTimestampNext"]; } From 855afa718fa7627aa05d62f119645f4715ff4c1e Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Tue, 12 Sep 2023 15:55:04 +0100 Subject: [PATCH 4/9] kvserver: cover more cases in TestStoreMergeGCHint Epic: none Release note: none --- pkg/kv/kvserver/client_merge_test.go | 106 +++++++++++++++------------ 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/pkg/kv/kvserver/client_merge_test.go b/pkg/kv/kvserver/client_merge_test.go index 8cd37c5ac6a2..98ce6404125f 100644 --- a/pkg/kv/kvserver/client_merge_test.go +++ b/pkg/kv/kvserver/client_merge_test.go @@ -5254,51 +5254,67 @@ func TestStoreMergeGCHint(t *testing.T) { wantRangeDelete bool // the hint must indicate a whole range deletion wantGCTimestamp bool // the hint must indicate a pending GC timestamp - }{ - { - name: "merge similar ranges", - dataLeft: true, - dataRight: true, - delLeft: true, - delRight: true, - - wantRangeDelete: true, - wantGCTimestamp: true, - }, - { - name: "merge with data on right", - dataLeft: true, - dataRight: true, - delLeft: true, - - wantRangeDelete: false, - wantGCTimestamp: true, - }, - { - name: "merge with data on left", - dataLeft: true, - dataRight: true, - delRight: true, - - wantRangeDelete: false, - wantGCTimestamp: true, - }, - { - name: "merge with empty on left", - dataRight: true, - delRight: true, - - wantRangeDelete: true, - wantGCTimestamp: true, - }, - { - name: "merge with empty on right", - dataLeft: true, - delLeft: true, - - wantRangeDelete: true, - wantGCTimestamp: true, - }, + }{{ + name: "merge empty ranges", + dataLeft: false, dataRight: false, + delLeft: false, delRight: false, + wantRangeDelete: false, + wantGCTimestamp: false, + }, { + name: "merge after deleting empty left", + dataLeft: false, dataRight: false, + delLeft: true, delRight: false, + wantRangeDelete: true, + wantGCTimestamp: true, + }, { + name: "merge after deleting empty right", + dataLeft: false, dataRight: false, + delLeft: false, delRight: true, + wantRangeDelete: true, + wantGCTimestamp: true, + }, { + name: "merge after deleting both empty", + dataLeft: false, dataRight: false, + delLeft: true, delRight: true, + wantRangeDelete: true, + wantGCTimestamp: true, + }, { + name: "merge similar ranges", + dataLeft: true, dataRight: true, + delLeft: true, delRight: true, + wantRangeDelete: true, + wantGCTimestamp: true, + }, { + name: "merge with data on right", + dataLeft: true, dataRight: true, + delLeft: true, delRight: false, + wantRangeDelete: false, + wantGCTimestamp: true, + }, { + name: "merge with data on left", + dataLeft: true, dataRight: true, + delLeft: false, delRight: true, + wantRangeDelete: false, + wantGCTimestamp: true, + }, { + name: "merge with no deletes", + dataLeft: true, dataRight: true, + delLeft: false, delRight: false, + wantRangeDelete: false, + wantGCTimestamp: false, + }, { + name: "merge with empty on left", + dataLeft: false, dataRight: true, + delLeft: false, delRight: true, + wantRangeDelete: true, + wantGCTimestamp: true, + }, { + name: "merge with empty on right", + dataLeft: true, dataRight: false, + delLeft: true, delRight: false, + wantRangeDelete: true, + wantGCTimestamp: true, + }, } { t.Run(d.name, func(t *testing.T) { ctx := context.Background() From e95b683265f2f90cfe10ae10fc6a2d28afcc968f Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Fri, 8 Sep 2023 12:34:35 +0100 Subject: [PATCH 5/9] roachpb: add unit tests for GCHint.Merge The test checks that all the GCHint invariants hold before/after the Merge operation; that the Merge operation is commutative; and that Merge returns true iff it modified the hint. For now, the test only Merges an empty hint with a non-empty hint. Soon adding tests for non-empty + non-empty cases. The test helped to fix one bug in Merge commutativity. Epic: none Release note: none --- pkg/roachpb/metadata_test.go | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index cc95cff2c4fa..047af1f40087 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -17,6 +17,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -462,3 +463,78 @@ func TestAddTier(t *testing.T) { require.Equal(t, l2, l1.AddTier(Tier{Key: "foo", Value: "bar"})) require.Equal(t, l3, l2.AddTier(Tier{Key: "bar", Value: "foo"})) } + +func TestGCHint(t *testing.T) { + var empty hlc.Timestamp + ts1, ts2, ts3 := makeTS(1234, 2), makeTS(2345, 0), makeTS(3456, 10) + hint := func(rangeT, minT, maxT hlc.Timestamp) GCHint { + return GCHint{ + LatestRangeDeleteTimestamp: rangeT, + GCTimestamp: minT, + GCTimestampNext: maxT, + } + } + + // checkInvariants verifies that the GCHint is well-formed. + checkInvariants := func(t *testing.T, hint GCHint) { + t.Helper() + if hint.GCTimestamp.IsEmpty() { + require.True(t, hint.GCTimestampNext.IsEmpty()) + } + if hint.GCTimestampNext.IsSet() { + require.True(t, hint.GCTimestamp.Less(hint.GCTimestampNext)) + } + } + checkInvariants(t, GCHint{}) + + // merge runs GCHint.Merge with the given parameters, and verifies that the + // semantics of the returned "updated" bool are satisfied. + merge := func(t *testing.T, lhs, rhs GCHint, leftEmtpy, rightEmpty bool) GCHint { + t.Helper() + before := lhs + updated := lhs.Merge(&rhs, leftEmtpy, rightEmpty) + require.Equal(t, !lhs.Equal(before), updated, "Merge return value incorrect") + return lhs + } + // checkMerge runs GCHint.Merge, and verifies that: + // - The result of merging 2 hints does not depend on the order. + // - Merge returns true iff it modified the hint. + // - Merge returns GCHint conforming with the invariants. + checkMerge := func(t *testing.T, lhs, rhs GCHint, leftEmpty, rightEmpty bool) GCHint { + t.Helper() + res1 := merge(t, lhs, rhs, leftEmpty, rightEmpty) + res2 := merge(t, rhs, lhs, rightEmpty, leftEmpty) + require.Equal(t, res1, res2, "Merge is not commutative") + checkInvariants(t, res1) + return res1 + } + + // Test the effect of GCHint.Merge with an empty hint. Additionally, test that + // the result of such a Merge does not depend on the order, i.e. Merge is + // commutative; and that Merge returns a correct "updated" bool. + for _, h := range []GCHint{ + hint(empty, empty, empty), + hint(empty, ts1, empty), + hint(empty, ts1, ts3), + hint(ts1, empty, empty), + hint(ts1, ts1, ts3), + hint(ts2, ts1, ts3), + } { + t.Run("Merge-with-empty-hint", func(t *testing.T) { + for _, lr := range [][]bool{ + {false, false}, + {false, true}, + {true, false}, + {true, true}, + } { + t.Run(fmt.Sprintf("leftEmpty=%v/rightEmpty=%v", lr[0], lr[1]), func(t *testing.T) { + leftEmpty, rightEmpty := lr[0], lr[1] + checkInvariants(t, h) + checkMerge(t, h, GCHint{}, leftEmpty, rightEmpty) + }) + } + }) + } + + // TODO(pavelkalinnikov): test Merge with non-empty hints. +} From 247c2c8f68bda3452d91067213d61fe4d5230ac0 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Mon, 11 Sep 2023 14:21:00 +0100 Subject: [PATCH 6/9] roachpb: unit tests for GCHint.ScheduleGCFor The test checks that all the GCHint invariants hold before/after the ScheduleGCFor operation, and that it returns true iff it modified the hint. The test helped to fix one bug in the method implementation. Epic: none Release note: none --- pkg/roachpb/metadata_test.go | 44 +++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index 047af1f40087..8f8b07744e27 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/errors" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -535,6 +536,47 @@ func TestGCHint(t *testing.T) { } }) } - // TODO(pavelkalinnikov): test Merge with non-empty hints. + + for _, tc := range []struct { + was GCHint + add hlc.Timestamp + want GCHint + }{ + // Adding an empty timestamp is a no-op. + {was: GCHint{}, add: empty, want: GCHint{}}, + {was: hint(empty, ts1, empty), add: empty, want: hint(empty, ts1, empty)}, + {was: hint(empty, ts1, ts2), add: empty, want: hint(empty, ts1, ts2)}, + {was: hint(ts1, ts1, ts2), add: empty, want: hint(ts1, ts1, ts2)}, + // Any timestamp is added to a hint with empty timestamps. + {was: GCHint{}, add: ts1, want: hint(empty, ts1, empty)}, + {was: hint(ts1, empty, empty), add: ts1, want: hint(ts1, ts1, empty)}, + {was: hint(ts2, empty, empty), add: ts1, want: hint(ts2, ts1, empty)}, + // For a hint with only one timestamp, test all possible relative positions + // of the newly added timestamp. + {was: hint(empty, ts2, empty), add: ts1, want: hint(empty, ts1, ts2)}, + {was: hint(empty, ts2, empty), add: ts2, want: hint(empty, ts2, empty)}, // no-op + {was: hint(empty, ts1, empty), add: ts2, want: hint(empty, ts1, ts2)}, + {was: hint(ts1, ts1, empty), add: ts2, want: hint(ts1, ts1, ts2)}, + // For a hint with both timestamps, test all possible relative positions of + // the newly added timestamp. + {was: hint(empty, ts2, ts3), add: ts1, want: hint(empty, ts1, ts3)}, + {was: hint(empty, ts2, ts3), add: ts2, want: hint(empty, ts2, ts3)}, // no-op + {was: hint(empty, ts1, ts3), add: ts2, want: hint(empty, ts1, ts3)}, // no-op + {was: hint(empty, ts1, ts3), add: ts3, want: hint(empty, ts1, ts3)}, // no-op + {was: hint(empty, ts1, ts2), add: ts2, want: hint(empty, ts1, ts2)}, // no-op + {was: hint(empty, ts1, ts2), add: ts3, want: hint(empty, ts1, ts3)}, + {was: hint(ts1, ts1, ts2), add: ts3, want: hint(ts1, ts1, ts3)}, + {was: hint(ts2, ts1, ts2), add: ts3, want: hint(ts2, ts1, ts3)}, + {was: hint(ts3, ts1, ts2), add: ts3, want: hint(ts3, ts1, ts3)}, + } { + t.Run("ScheduleGCFor", func(t *testing.T) { + hint := tc.was + checkInvariants(t, hint) + updated := hint.ScheduleGCFor(tc.add) + checkInvariants(t, hint) + assert.Equal(t, !hint.Equal(tc.was), updated, "returned incorrect 'updated' bit") + assert.Equal(t, tc.want, hint) + }) + } } From 5859a704cdf7150d6f1767f9f92b235aa99bf7d5 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Mon, 11 Sep 2023 14:52:07 +0100 Subject: [PATCH 7/9] roachpb: unit test for GCHint.UpdateAfterGC Epic: none Release note: none --- pkg/roachpb/metadata_test.go | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index 8f8b07744e27..2cd523a4db17 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -579,4 +579,38 @@ func TestGCHint(t *testing.T) { assert.Equal(t, tc.want, hint) }) } + + for _, tc := range []struct { + was GCHint + gced hlc.Timestamp + want GCHint + }{ + // Check empty timestamp cases. + {was: GCHint{}, gced: empty, want: GCHint{}}, + {was: GCHint{}, gced: ts1, want: GCHint{}}, + {was: hint(empty, ts1, empty), gced: empty, want: hint(empty, ts1, empty)}, + {was: hint(ts2, ts1, ts3), gced: empty, want: hint(ts2, ts1, ts3)}, + {was: hint(ts2, ts1, empty), gced: empty, want: hint(ts2, ts1, empty)}, + // Check that the GC hint is updated correctly with all relative positions + // of the threshold. + {was: hint(empty, ts2, ts3), gced: ts1, want: hint(empty, ts2, ts3)}, // no-op + {was: hint(empty, ts2, ts3), gced: ts2, want: hint(empty, ts3, empty)}, + {was: hint(empty, ts1, ts3), gced: ts2, want: hint(empty, ts3, empty)}, + {was: hint(empty, ts1, ts3), gced: ts3, want: GCHint{}}, + {was: hint(empty, ts1, ts2), gced: ts3, want: GCHint{}}, + // Check that the entire-range hint is updated correctly with all relative + // positions of the threshold. + {was: hint(ts2, empty, empty), gced: ts1, want: hint(ts2, empty, empty)}, + {was: hint(ts2, empty, empty), gced: ts2, want: GCHint{}}, + {was: hint(ts2, empty, empty), gced: ts3, want: GCHint{}}, + } { + t.Run("UpdateAfterGC", func(t *testing.T) { + hint := tc.was + checkInvariants(t, hint) + updated := hint.UpdateAfterGC(tc.gced) + checkInvariants(t, hint) + assert.Equal(t, !hint.Equal(tc.was), updated, "returned incorrect 'updated' bit") + assert.Equal(t, tc.want, hint) + }) + } } From 3734a1a98ea46f744c6f12642b616b2a6f99f325 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Mon, 11 Sep 2023 18:23:22 +0100 Subject: [PATCH 8/9] roachpb: test GCHint.ForwardLatestRangeDeleteTimestamp Epic: none Release note: none --- pkg/roachpb/metadata_test.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index 2cd523a4db17..8930db3fc84a 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -613,4 +613,25 @@ func TestGCHint(t *testing.T) { assert.Equal(t, tc.want, hint) }) } + + for _, tc := range []struct { + was GCHint + add hlc.Timestamp + want GCHint + }{ + {was: GCHint{}, add: empty, want: GCHint{}}, + {was: hint(ts2, empty, empty), add: empty, want: hint(ts2, empty, empty)}, + {was: hint(ts2, empty, empty), add: ts1, want: hint(ts2, empty, empty)}, + {was: hint(ts2, empty, empty), add: ts2, want: hint(ts2, empty, empty)}, + {was: hint(ts2, empty, empty), add: ts3, want: hint(ts3, empty, empty)}, + } { + t.Run("ForwardLatestRangeDeleteTimestamp", func(t *testing.T) { + hint := tc.was + checkInvariants(t, hint) + updated := hint.ForwardLatestRangeDeleteTimestamp(tc.add) + checkInvariants(t, hint) + assert.Equal(t, !hint.Equal(tc.was), updated, "returned incorrect 'updated' bit") + assert.Equal(t, tc.want, hint) + }) + } } From e28e450c9b84616aaf16867bf990795b166ec861 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Tue, 12 Sep 2023 09:03:12 +0100 Subject: [PATCH 9/9] roachpb: more tests for GCHint.Merge Epic: none Release note: none --- pkg/roachpb/metadata_test.go | 45 +++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index 8930db3fc84a..672bb3f6d719 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -536,7 +536,50 @@ func TestGCHint(t *testing.T) { } }) } - // TODO(pavelkalinnikov): test Merge with non-empty hints. + + ts4 := makeTS(4567, 5) + for _, tc := range []struct { + lhs GCHint + rhs GCHint + lEmpty bool + rEmpty bool + want GCHint + }{ + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts1, empty), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts1, ts2), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts1, ts3), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts1, ts4), want: hint(empty, ts1, ts4)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts2, empty), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts2, ts3), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts2, ts4), want: hint(empty, ts1, ts4)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts3, empty), want: hint(empty, ts1, ts3)}, + {lhs: hint(empty, ts1, ts3), rhs: hint(empty, ts3, ts4), want: hint(empty, ts1, ts4)}, + + {lhs: hint(ts1, empty, empty), rhs: hint(ts2, empty, empty), lEmpty: false, rEmpty: false, + want: hint(ts2, empty, empty)}, + {lhs: hint(ts1, empty, empty), rhs: hint(ts2, empty, empty), lEmpty: true, rEmpty: false, + want: hint(ts2, empty, empty)}, + {lhs: hint(ts1, empty, empty), rhs: hint(ts2, empty, empty), lEmpty: false, rEmpty: true, + want: hint(ts2, empty, empty)}, + {lhs: hint(ts1, empty, empty), rhs: hint(ts2, empty, empty), lEmpty: true, rEmpty: true, + want: hint(ts2, empty, empty)}, + + {lhs: hint(ts2, empty, empty), rhs: hint(empty, ts1, ts3), lEmpty: false, rEmpty: false, + want: hint(empty, ts1, ts3)}, + {lhs: hint(ts2, empty, empty), rhs: hint(empty, ts1, ts3), lEmpty: true, rEmpty: false, + want: hint(empty, ts1, ts3)}, + {lhs: hint(ts2, empty, empty), rhs: hint(empty, ts1, ts3), lEmpty: false, rEmpty: true, + want: hint(ts2, ts1, ts3)}, + {lhs: hint(ts2, empty, empty), rhs: hint(empty, ts1, ts3), lEmpty: true, rEmpty: true, + want: hint(ts2, ts1, ts3)}, + } { + t.Run("Merge", func(t *testing.T) { + checkInvariants(t, tc.lhs) + checkInvariants(t, tc.rhs) + got := checkMerge(t, tc.lhs, tc.rhs, tc.lEmpty, tc.rEmpty) + require.Equal(t, tc.want, got) + }) + } for _, tc := range []struct { was GCHint