From 285aead75f77740df357a370aff37e91e579ff8a Mon Sep 17 00:00:00 2001 From: Oleg Afanasyev Date: Wed, 15 Jun 2022 11:37:43 +0100 Subject: [PATCH] kvserver/gc: remove range tombstones during GC Previously range tombstones were taken into account when doing point key GC, but were never removed themselves. This PR is adding support for removal of old range keys. This PR is extending GCRequest to include range tombstones. Range tombstones are populated by GC in requests sent after all the point keys under the GC threshold are deleted thus guaranteeing that point keys are not accidentally exposed. When processing GC range tombstone requests, replica does an addtional step to validate these assumptions and fail deletions if there's data still covered by range tombstones. Release note: None --- pkg/kv/kvserver/batcheval/BUILD.bazel | 2 + pkg/kv/kvserver/batcheval/cmd_delete_range.go | 20 - pkg/kv/kvserver/batcheval/cmd_gc.go | 84 ++- pkg/kv/kvserver/batcheval/ranges.go | 33 + pkg/kv/kvserver/batcheval/ranges_test.go | 117 +++ pkg/kv/kvserver/gc/data_distribution_test.go | 21 +- pkg/kv/kvserver/gc/gc.go | 141 +++- pkg/kv/kvserver/gc/gc_iterator.go | 6 +- pkg/kv/kvserver/gc/gc_old_test.go | 6 +- pkg/kv/kvserver/gc/gc_random_test.go | 223 +++++- pkg/kv/kvserver/gc/gc_test.go | 247 ++++++- pkg/kv/kvserver/metrics.go | 8 + pkg/kv/kvserver/mvcc_gc_queue.go | 2 + pkg/kv/kvserver/mvcc_gc_queue_test.go | 199 +++-- pkg/kv/kvserver/rditer/replica_data_iter.go | 45 +- .../kvserver/rditer/replica_data_iter_test.go | 6 +- pkg/roachpb/api.proto | 9 +- pkg/storage/mvcc.go | 331 ++++++++- pkg/storage/mvcc_test.go | 690 +++++++++++++++++- pkg/ts/catalog/chart_catalog.go | 4 + 20 files changed, 2048 insertions(+), 146 deletions(-) create mode 100644 pkg/kv/kvserver/batcheval/ranges.go create mode 100644 pkg/kv/kvserver/batcheval/ranges_test.go diff --git a/pkg/kv/kvserver/batcheval/BUILD.bazel b/pkg/kv/kvserver/batcheval/BUILD.bazel index 1220bafd9e6c..a62c7057fba0 100644 --- a/pkg/kv/kvserver/batcheval/BUILD.bazel +++ b/pkg/kv/kvserver/batcheval/BUILD.bazel @@ -47,6 +47,7 @@ go_library( "declare.go", "eval_context.go", "intent.go", + "ranges.go", "split_stats_helper.go", "stateloader.go", "transaction.go", @@ -118,6 +119,7 @@ go_test( "declare_test.go", "intent_test.go", "main_test.go", + "ranges_test.go", "testutils_test.go", "transaction_test.go", ], diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go index 4c063d2692cf..f0c0c52cd82e 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go @@ -117,23 +117,3 @@ func DeleteRange( // error is not consumed by the caller because the result will be discarded. return result.FromAcquiredLocks(h.Txn, deleted...), err } - -// rangeTombstonePeekBounds returns the left and right bounds that -// MVCCDeleteRangeUsingTombstone can read in order to detect adjacent range -// tombstones to merge with or fragment. The bounds will be truncated to the -// Raft range bounds if given. -func rangeTombstonePeekBounds( - startKey, endKey, rangeStart, rangeEnd roachpb.Key, -) (roachpb.Key, roachpb.Key) { - leftPeekBound := startKey.Prevish(roachpb.PrevishKeyLength) - if len(rangeStart) > 0 && leftPeekBound.Compare(rangeStart) <= 0 { - leftPeekBound = rangeStart - } - - rightPeekBound := endKey.Next() - if len(rangeEnd) > 0 && rightPeekBound.Compare(rangeEnd) >= 0 { - rightPeekBound = rangeEnd - } - - return leftPeekBound.Clone(), rightPeekBound.Clone() -} diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go index d09ede767dba..2f9290bf2015 100644 --- a/pkg/kv/kvserver/batcheval/cmd_gc.go +++ b/pkg/kv/kvserver/batcheval/cmd_gc.go @@ -12,6 +12,7 @@ package batcheval import ( "context" + "sort" "time" "github.com/cockroachdb/cockroach/pkg/keys" @@ -44,6 +45,17 @@ func declareKeysGC( latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: key.Key}, header.Timestamp) } } + // Extend the range key latches by feather to ensure MVCC stats + // computations correctly account for adjacent range keys tombstones if they + // need to be split. + // TODO(oleg): These latches are very broad and will be disruptive to read and + // write operations despite only accessing "stale" data. We should think of + // better integrating it with latchless GC approach. + for _, span := range mergeAdjacentSpans(makeLookupBoundariesForGCRanges(rs.GetStartKey().AsRawKey(), + nil, gcr.RangeKeys)) { + latchSpans.AddMVCC(spanset.SpanReadWrite, span, + header.Timestamp) + } // Be smart here about blocking on the threshold keys. The MVCC GC queue can // send an empty request first to bump the thresholds, and then another one // that actually does work but can avoid declaring these keys below. @@ -54,6 +66,25 @@ func declareKeysGC( latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(rs.GetStartKey())}) } +// Create latches and merge adjacent. +func mergeAdjacentSpans(spans []roachpb.Span) []roachpb.Span { + if len(spans) == 0 { + return nil + } + sort.Slice(spans, func(i, j int) bool { + return spans[i].Key.Compare(spans[j].Key) < 0 + }) + j := 0 + for i := 1; i < len(spans); i++ { + if spans[i].Key.Compare(spans[j].EndKey) < 0 { + spans[j].EndKey = spans[i].EndKey + } else { + j++ + } + } + return spans[0 : j+1] +} + // GC iterates through the list of keys to garbage collect // specified in the arguments. MVCCGarbageCollect is invoked on each // listed key along with the expiration timestamp. The GC metadata @@ -85,7 +116,8 @@ func GC( // 2. the read could be served off a follower, which could be applying the // GC request's effect from the raft log. Latches held on the leaseholder // would have no impact on a follower read. - if !args.Threshold.IsEmpty() && len(args.Keys) != 0 && + if !args.Threshold.IsEmpty() && + (len(args.Keys) != 0 || len(args.RangeKeys) != 0) && !cArgs.EvalCtx.EvalKnobs().AllowGCWithNewThresholdAndKeys { return result.Result{}, errors.AssertionFailedf( "GC request can set threshold or it can GC keys, but it is unsafe for it to do both") @@ -119,6 +151,16 @@ func GC( } } + // Garbage collect range keys. Note that we pass latch range boundaries for + // each key as we may need to merge range keys with adjacent ones, but we + // are restricted on how far we are allowed to read. + desc := cArgs.EvalCtx.Desc() + rangeKeys := makeCollectableGCRangesFromGCRequests(desc.StartKey.AsRawKey(), + desc.EndKey.AsRawKey(), args.RangeKeys) + if err := storage.MVCCGarbageCollectRangeKeys(ctx, readWriter, cArgs.Stats, rangeKeys); err != nil { + return result.Result{}, err + } + // Optionally bump the GC threshold timestamp. var res result.Result if !args.Threshold.IsEmpty() { @@ -147,3 +189,43 @@ func GC( return res, nil } + +// makeLookupBoundariesForGCRanges creates spans that could be used for latches +// and iterators when performing range tombstone garbage collection. Each of +// spans includes additional keys to the left and right of the GD'd range to +// ensure merging of range tombstones could be performed and at the same time +// no data is accessed outside of latches. +func makeLookupBoundariesForGCRanges( + rangeStart, rangeEnd roachpb.Key, rangeKeys []roachpb.GCRequest_GCRangeKey, +) []roachpb.Span { + spans := make([]roachpb.Span, len(rangeKeys)) + for i := range rangeKeys { + l, r := rangeTombstonePeekBounds(rangeKeys[i].StartKey, rangeKeys[i].EndKey, rangeStart, rangeEnd) + spans[i] = roachpb.Span{ + Key: l, + EndKey: r, + } + } + return spans +} + +// makeCollectableGCRangesFromGCRequests creates GC collectable ranges +// containing ranges to be removed as well as safe iteration boundaries. +// See makeLookupBoundariesForGCRanges for why additional boundaries are used. +func makeCollectableGCRangesFromGCRequests( + rangeStart, rangeEnd roachpb.Key, rangeKeys []roachpb.GCRequest_GCRangeKey, +) []storage.CollectableGCRangeKey { + latches := makeLookupBoundariesForGCRanges(rangeStart, rangeEnd, rangeKeys) + collectableKeys := make([]storage.CollectableGCRangeKey, len(rangeKeys)) + for i, rk := range rangeKeys { + collectableKeys[i] = storage.CollectableGCRangeKey{ + MVCCRangeKey: storage.MVCCRangeKey{ + StartKey: rk.StartKey, + EndKey: rk.EndKey, + Timestamp: rk.Timestamp, + }, + LatchSpan: latches[i], + } + } + return collectableKeys +} diff --git a/pkg/kv/kvserver/batcheval/ranges.go b/pkg/kv/kvserver/batcheval/ranges.go new file mode 100644 index 000000000000..967fc552ff19 --- /dev/null +++ b/pkg/kv/kvserver/batcheval/ranges.go @@ -0,0 +1,33 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package batcheval + +import "github.com/cockroachdb/cockroach/pkg/roachpb" + +// rangeTombstonePeekBounds returns the left and right bounds that +// MVCCDeleteRangeUsingTombstone can read in order to detect adjacent range +// tombstones to merge with or fragment. The bounds will be truncated to the +// Raft range bounds if given. +func rangeTombstonePeekBounds( + startKey, endKey, rangeStart, rangeEnd roachpb.Key, +) (roachpb.Key, roachpb.Key) { + leftPeekBound := startKey.Prevish(roachpb.PrevishKeyLength) + if len(rangeStart) > 0 && leftPeekBound.Compare(rangeStart) <= 0 { + leftPeekBound = rangeStart + } + + rightPeekBound := endKey.Next() + if len(rangeEnd) > 0 && rightPeekBound.Compare(rangeEnd) >= 0 { + rightPeekBound = rangeEnd + } + + return leftPeekBound.Clone(), rightPeekBound.Clone() +} diff --git a/pkg/kv/kvserver/batcheval/ranges_test.go b/pkg/kv/kvserver/batcheval/ranges_test.go new file mode 100644 index 000000000000..f74c97a75985 --- /dev/null +++ b/pkg/kv/kvserver/batcheval/ranges_test.go @@ -0,0 +1,117 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package batcheval + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/stretchr/testify/require" +) + +func TestMergeGCRangeBoundaries(t *testing.T) { + defer leaktest.AfterTest(t)() + + gcr := func(start, end roachpb.Key) roachpb.GCRequest_GCRangeKey { + return roachpb.GCRequest_GCRangeKey{ + StartKey: start, + EndKey: end, + } + } + span := func(start, end roachpb.Key) roachpb.Span { + return roachpb.Span{ + Key: start, + EndKey: end, + } + } + key := func(k string) roachpb.Key { + return roachpb.Key(k) + } + preKey := func(k string) roachpb.Key { + l, _ := rangeTombstonePeekBounds(key(k), key(k+"zzzzzzz"), nil, nil) + return l + } + postKey := func(k string) roachpb.Key { + _, r := rangeTombstonePeekBounds(key(""), key(k), nil, nil) + return r + } + + for _, d := range []struct { + name string + rangeStart roachpb.Key + rangeEnd roachpb.Key + rangeKeys []roachpb.GCRequest_GCRangeKey + spans []roachpb.Span + }{ + { + name: "empty", + rangeStart: key("a"), + rangeEnd: key("b"), + rangeKeys: []roachpb.GCRequest_GCRangeKey{}, + spans: nil, + }, + { + name: "full range", + rangeStart: key("a"), + rangeEnd: key("b"), + rangeKeys: []roachpb.GCRequest_GCRangeKey{ + gcr(key("a"), key("b")), + }, + spans: []roachpb.Span{ + span(key("a"), key("b")), + }, + }, + { + name: "sub range", + rangeStart: key("a"), + rangeEnd: key("z"), + rangeKeys: []roachpb.GCRequest_GCRangeKey{ + gcr(key("c"), key("d")), + }, + spans: []roachpb.Span{ + span(preKey("c"), postKey("d")), + }, + }, + { + name: "non adjacent", + rangeStart: key("a"), + rangeEnd: key("z"), + rangeKeys: []roachpb.GCRequest_GCRangeKey{ + gcr(key("c"), key("d")), + gcr(key("e"), key("f")), + }, + spans: []roachpb.Span{ + span(preKey("c"), postKey("d")), + span(preKey("e"), postKey("f")), + }, + }, + { + name: "merge adjacent", + rangeStart: key("a"), + rangeEnd: key("z"), + rangeKeys: []roachpb.GCRequest_GCRangeKey{ + gcr(key("a"), key("b")), + gcr(key("b"), key("c")), + gcr(key("c"), key("d")), + }, + spans: []roachpb.Span{ + span(key("a"), postKey("d")), + }, + }, + } { + t.Run(d.name, func(t *testing.T) { + spans := makeLookupBoundariesForGCRanges(d.rangeStart, d.rangeEnd, d.rangeKeys) + merged := mergeAdjacentSpans(spans) + require.Equal(t, d.spans, merged, "combined spans") + }) + } +} diff --git a/pkg/kv/kvserver/gc/data_distribution_test.go b/pkg/kv/kvserver/gc/data_distribution_test.go index 669aa2a9767c..244ff793b7c3 100644 --- a/pkg/kv/kvserver/gc/data_distribution_test.go +++ b/pkg/kv/kvserver/gc/data_distribution_test.go @@ -296,7 +296,7 @@ func newDataDistribution( } retries = 0 } - return nextKey, nil, keyTimestamps, hasIntent + return nextKey, unusedEndKey, keyTimestamps, hasIntent } generateRangeKey := func() (startKey, endKey roachpb.Key, timestamps []hlc.Timestamp, hasIntent bool) { @@ -375,14 +375,27 @@ type distSpec interface { // uniformDistSpec is a distSpec which represents uniform distributions over its // various dimensions. type uniformDistSpec struct { - tsSecFrom, tsSecTo int64 // seconds + tsSecFrom, tsSecTo int64 + // Intents are split into two categories with distinct time ranges. + // All intents have lower timestamp bound to ensure they don't overlap with + // range tombstones since we will not be able to put a range tombstone over + // intent. + // Additionally, we have two time thresholds for intents. This is needed to + // ensure that we have certain fraction of intents GC'able since they lay + // below certain threshold. tsSecMinIntent, tsSecOldIntentTo int64 keySuffixMin, keySuffixMax int valueLenMin, valueLenMax int deleteFrac float64 + // keysPerValue parameters determine number of versions for a key. This number + // includes tombstones and intents which may be present on top of the history. keysPerValueMin, keysPerValueMax int - intentFrac, oldIntentFrac float64 - rangeKeyFrac float64 + // Fractions define how likely is that a key will belong to one of categories. + // If we only had a single version for each key, then that would be fraction + // of total number of objects, but if we have many versions, this value would + // roughly be total objects/avg(keysPerValueMin, keysPerValueMax) * frac. + intentFrac, oldIntentFrac float64 + rangeKeyFrac float64 } var _ distSpec = uniformDistSpec{} diff --git a/pkg/kv/kvserver/gc/gc.go b/pkg/kv/kvserver/gc/gc.go index 151f6c756ad8..21bbff07c058 100644 --- a/pkg/kv/kvserver/gc/gc.go +++ b/pkg/kv/kvserver/gc/gc.go @@ -19,12 +19,14 @@ import ( "context" "fmt" "math" + "sort" "time" "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/storage" @@ -160,7 +162,7 @@ type Info struct { // Stats about the userspace key-values considered, namely the number of // keys with GC'able data, the number of "old" intents and the number of // associated distinct transactions. - NumKeysAffected, IntentsConsidered, IntentTxns int + NumKeysAffected, NumRangeKeysAffected, IntentsConsidered, IntentTxns int // TransactionSpanTotal is the total number of entries in the transaction span. TransactionSpanTotal int // Summary of transactions which were found GCable (assuming that @@ -194,6 +196,13 @@ type Info struct { // AffectedVersionsValBytes is the number of (fully encoded) bytes deleted from values in the storage engine. // See AffectedVersionsKeyBytes for caveats. AffectedVersionsValBytes int64 + // AffectedVersionsRangeKeyBytes is the number of (fully encoded) bytes deleted from range keys. + // For this counter, we don't count start and end key unless all versions are deleted, but we + // do count timestamp size for each version. + AffectedVersionsRangeKeyBytes int64 + // AffectedVersionsRangeValBytes is the number of (fully encoded) bytes deleted from values that + // belong to removed range keys. + AffectedVersionsRangeValBytes int64 } // RunOptions contains collection of limits that GC run applies when performing operations @@ -268,6 +277,10 @@ func Run( if err != nil { return Info{}, err } + err = processReplicatedRangeTombstones(ctx, desc, snap, now, newThreshold, gcer, &info) + if err != nil { + return Info{}, err + } // From now on, all keys processed are range-local and inline (zero timestamp). @@ -739,6 +752,132 @@ func processAbortSpan( }) } +type rangeKeyBatcher struct { + gcer GCer + batchSize int64 + + pending []storage.MVCCRangeKey + pendingSize int64 +} + +// addAndMaybeFlushRangeFragment will try to add fragment to existing batch +// and flush it if batch is full. +// unsafeRangeKeyValues contains all range key values with the same key range +// that has to be GCd. +// To ensure the resulting batch is not too large, we need to account for all +// removed versions. This method will try to include versions from oldest to +// newest and will stop if we either reach batch size or reach the newest +// provided version. Only the last version of this iteration will be flushed. +// If more versions remained after flush, process would be resumed. +func (b *rangeKeyBatcher) addAndMaybeFlushRangeFragment( + ctx context.Context, unsafeRangeKeyValues []storage.MVCCRangeKeyValue, +) error { + maxKey := len(unsafeRangeKeyValues) - 1 + for i := maxKey; i >= 0; i-- { + rk := unsafeRangeKeyValues[i].RangeKey + rangeSize := int64(len(rk.StartKey)) + int64(len(rk.EndKey)) + storage.MVCCVersionTimestampSize + hasData := len(b.pending) > 0 || i < maxKey + if hasData && (b.pendingSize+rangeSize) >= b.batchSize { + // If we need to send a batch, add previous key from history that we + // already accounted for and flush pending. + if i < maxKey { + b.addRangeKey(unsafeRangeKeyValues[i+1].RangeKey) + } + if err := b.flushPendingFragments(ctx); err != nil { + return err + } + } + b.pendingSize += rangeSize + } + b.addRangeKey(unsafeRangeKeyValues[0].RangeKey) + return nil +} + +func (b *rangeKeyBatcher) addRangeKey(unsafeRk storage.MVCCRangeKey) { + if len(b.pending) == 0 { + b.pending = append(b.pending, unsafeRk.Clone()) + return + } + lastFragment := b.pending[len(b.pending)-1] + // If new fragment is adjacent to previous one and has the same timestamp, + // merge fragments. + if lastFragment.EndKey.Equal(unsafeRk.StartKey) && + lastFragment.Timestamp.Equal(unsafeRk.Timestamp) { + lastFragment.EndKey = unsafeRk.EndKey.Clone() + b.pending[len(b.pending)-1] = lastFragment + } else { + b.pending = append(b.pending, unsafeRk.Clone()) + } +} + +func (b *rangeKeyBatcher) flushPendingFragments(ctx context.Context) error { + if pendingCount := len(b.pending); pendingCount > 0 { + toSend := make([]roachpb.GCRequest_GCRangeKey, pendingCount) + for i, rk := range b.pending { + toSend[i] = roachpb.GCRequest_GCRangeKey{ + StartKey: rk.StartKey, + EndKey: rk.EndKey, + Timestamp: rk.Timestamp, + } + } + b.pending = b.pending[:0] + b.pendingSize = 0 + return b.gcer.GC(ctx, nil, toSend) + } + return nil +} + +func processReplicatedRangeTombstones( + ctx context.Context, + desc *roachpb.RangeDescriptor, + snap storage.Reader, + now hlc.Timestamp, + gcThreshold hlc.Timestamp, + gcer GCer, + info *Info, +) error { + iter := rditer.NewReplicaMVCCDataIterator(desc, snap, rditer.ReplicaDataIteratorOptions{ + Reverse: false, + IterKind: storage.MVCCKeyIterKind, + KeyTypes: storage.IterKeyTypeRangesOnly, + }) + defer iter.Close() + + b := rangeKeyBatcher{ + gcer: gcer, + batchSize: KeyVersionChunkBytes, + } + for { + ok, err := iter.Valid() + if err != nil { + return err + } + if !ok { + break + } + rangeKeys := iter.RangeKeys() + + if idx := sort.Search(len(rangeKeys), func(i int) bool { + return rangeKeys[i].RangeKey.Timestamp.LessEq(gcThreshold) + }); idx < len(rangeKeys) { + if err = b.addAndMaybeFlushRangeFragment(ctx, rangeKeys[idx:]); err != nil { + return err + } + info.NumRangeKeysAffected++ + keyBytes := storage.MVCCVersionTimestampSize * int64(len(rangeKeys)-idx) + if idx == 0 { + keyBytes += int64(len(rangeKeys[0].RangeKey.StartKey) + len(rangeKeys[0].RangeKey.EndKey)) + } + info.AffectedVersionsRangeKeyBytes += keyBytes + for _, v := range rangeKeys[idx:] { + info.AffectedVersionsRangeValBytes += int64(len(v.Value)) + } + } + iter.Next() + } + return b.flushPendingFragments(ctx) +} + // batchingInlineGCer is a helper to paginate the GC of inline (i.e. zero // timestamp keys). After creation, keys are added via FlushingAdd(). A // final call to Flush() empties out the buffer when all keys were added. diff --git a/pkg/kv/kvserver/gc/gc_iterator.go b/pkg/kv/kvserver/gc/gc_iterator.go index e5c60e38bdd7..c657033dcfed 100644 --- a/pkg/kv/kvserver/gc/gc_iterator.go +++ b/pkg/kv/kvserver/gc/gc_iterator.go @@ -40,7 +40,11 @@ func makeGCIterator( desc *roachpb.RangeDescriptor, snap storage.Reader, threshold hlc.Timestamp, ) gcIterator { return gcIterator{ - it: rditer.NewReplicaMVCCDataIterator(desc, snap, true /* seekEnd */), + it: rditer.NewReplicaMVCCDataIterator(desc, snap, rditer.ReplicaDataIteratorOptions{ + Reverse: true, + IterKind: storage.MVCCKeyAndIntentsIterKind, + KeyTypes: storage.IterKeyTypePointsAndRanges, + }), threshold: threshold, } } diff --git a/pkg/kv/kvserver/gc/gc_old_test.go b/pkg/kv/kvserver/gc/gc_old_test.go index d70f38099108..1aa8a63562d7 100644 --- a/pkg/kv/kvserver/gc/gc_old_test.go +++ b/pkg/kv/kvserver/gc/gc_old_test.go @@ -51,7 +51,11 @@ func runGCOld( cleanupTxnIntentsAsyncFn CleanupTxnIntentsAsyncFunc, ) (Info, error) { - iter := rditer.NewReplicaMVCCDataIterator(desc, snap, false /* seekEnd */) + iter := rditer.NewReplicaMVCCDataIterator(desc, snap, rditer.ReplicaDataIteratorOptions{ + Reverse: false, + IterKind: storage.MVCCKeyAndIntentsIterKind, + KeyTypes: storage.IterKeyTypePointsOnly, + }) defer iter.Close() // Compute intent expiration (intent age at which we attempt to resolve). diff --git a/pkg/kv/kvserver/gc/gc_random_test.go b/pkg/kv/kvserver/gc/gc_random_test.go index ad4a09fa08a8..07be1a9a3341 100644 --- a/pkg/kv/kvserver/gc/gc_random_test.go +++ b/pkg/kv/kvserver/gc/gc_random_test.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -83,7 +84,7 @@ var ( keysPerValueMax: 100, intentFrac: .1, oldIntentFrac: .1, - rangeKeyFrac: .05, + rangeKeyFrac: .1, } ) @@ -242,6 +243,15 @@ func TestNewVsInvariants(t *testing.T) { ttlSec: 10, intentAgeSec: 15, }, + { + ds: someVersionsWithSomeRangeKeys, + now: hlc.Timestamp{ + WallTime: 100 * time.Second.Nanoseconds(), + }, + // Higher TTL means range tombstones between 70 sec and 50 sec are + // not removed. + ttlSec: 50, + }, } { t.Run(fmt.Sprintf("%v@%v,ttl=%vsec", tc.ds, tc.now, tc.ttlSec), func(t *testing.T) { rng := rand.New(rand.NewSource(1)) @@ -268,7 +278,7 @@ func TestNewVsInvariants(t *testing.T) { // Handle GC + resolve intents. var stats enginepb.MVCCStats require.NoError(t, - storage.MVCCGarbageCollect(ctx, eng, &stats, gcer.requests(), gcThreshold)) + storage.MVCCGarbageCollect(ctx, eng, &stats, gcer.pointKeys(), gcThreshold)) for _, i := range gcer.intents { l := roachpb.LockUpdate{ Span: roachpb.Span{Key: i.Key}, @@ -278,6 +288,12 @@ func TestNewVsInvariants(t *testing.T) { _, err := storage.MVCCResolveWriteIntent(ctx, eng, &stats, l) require.NoError(t, err, "failed to resolve intent") } + for _, batch := range gcer.rangeKeyBatches() { + rangeKeys := makeCollectableGCRangesFromGCRequests(desc.StartKey.AsRawKey(), + desc.EndKey.AsRawKey(), batch) + require.NoError(t, + storage.MVCCGarbageCollectRangeKeys(ctx, eng, &stats, rangeKeys)) + } assertLiveData(t, eng, beforeGC, *desc, tc.now, gcThreshold, intentThreshold, ttl, gcInfoNew) @@ -293,9 +309,13 @@ type historyItem struct { // assertLiveData will create a stream of expected values based on full data // set contained in provided "before" reader and compare it with the "after" // reader that contains data after applying GC request. +// Same process is then repeated with range tombstones. +// For range tombstones, we merge all the fragments before asserting to avoid +// any dependency on how key splitting is done inside pebble. // Generated expected values are produces by simulating GC in a naive way where // each value is considered live if: // - it is a value or tombstone and its timestamp is higher than gc threshold +// - it is a range tombstone and its timestamp is higher than gc threshold // - it is a first value at or below gc threshold and there are no deletions // between gc threshold and the value func assertLiveData( @@ -330,8 +350,19 @@ func assertLiveData( pointExpectationsGenerator := getExpectationsGenerator(t, pointIt, gcThreshold, intentThreshold, &expInfo) + rangeIt := before.NewMVCCIterator(storage.MVCCKeyIterKind, + storage.IterOptions{ + LowerBound: desc.StartKey.AsRawKey(), + UpperBound: desc.EndKey.AsRawKey(), + KeyTypes: storage.IterKeyTypeRangesOnly, + }) + defer rangeIt.Close() + rangeIt.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) + expectedRanges := mergeRanges(filterRangeFragments(rangeFragmentsFromIt(t, rangeIt), gcThreshold, + &expInfo)) + // Loop over engine data after applying GCer requests and compare with - // expected ranges. + // expected point keys. itAfter := after.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ LowerBound: desc.StartKey.AsRawKey(), UpperBound: desc.EndKey.AsRawKey(), @@ -369,6 +400,28 @@ func assertLiveData( } } + rangeItAfter := after.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ + LowerBound: desc.StartKey.AsRawKey(), + UpperBound: desc.EndKey.AsRawKey(), + KeyTypes: storage.IterKeyTypeRangesOnly, + RangeKeyMaskingBelow: hlc.Timestamp{}, + }) + defer rangeItAfter.Close() + rangeItAfter.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) + actualRanges := mergeRanges(rangeFragmentsFromIt(t, rangeItAfter)) + + // Be careful when enabling logging on tests with default large N, + // 1000 elements is ok, but 10k or 100k entries might become unreadable. + if log.V(1) { + ctx := context.Background() + log.Info(ctx, "Expected data:") + for _, l := range formatTable(engineData(t, before, desc), desc.StartKey.AsRawKey()) { + log.Infof(ctx, "%s", l) + } + } + + require.EqualValues(t, expectedRanges, actualRanges, "GC'd range tombstones") + require.EqualValues(t, expInfo, gcInfo, "collected gc info mismatch") } @@ -554,12 +607,121 @@ func getKeyHistory(t *testing.T, r storage.Reader, key roachpb.Key) string { return strings.Join(result, ", ") } +func rangeFragmentsFromIt(t *testing.T, it storage.MVCCIterator) [][]storage.MVCCRangeKeyValue { + var result [][]storage.MVCCRangeKeyValue + for { + ok, err := it.Valid() + require.NoError(t, err, "failed to read range tombstones from iterator") + if !ok { + break + } + _, r := it.HasPointAndRange() + if r { + fragments := make([]storage.MVCCRangeKeyValue, len(it.RangeKeys())) + for i, r := range it.RangeKeys() { + fragments[i] = r.Clone() + } + result = append(result, fragments) + } + it.NextKey() + } + return result +} + +// Filter all fragments that match GC criteria and update gcinfo accordingly. +func filterRangeFragments( + fragments [][]storage.MVCCRangeKeyValue, gcThreshold hlc.Timestamp, expInfo *Info, +) [][]storage.MVCCRangeKeyValue { + var result [][]storage.MVCCRangeKeyValue + for _, stack := range fragments { + var newStack []storage.MVCCRangeKeyValue + for i, r := range stack { + if r.RangeKey.Timestamp.LessEq(gcThreshold) { + // Update expectations: + // On lowest range timestamp bump range counter. + if i == len(stack)-1 { + expInfo.NumRangeKeysAffected++ + } + // If all fragments are deleted then keys bytes are accounted for. + if i == 0 { + expInfo.AffectedVersionsRangeKeyBytes += int64(len(r.RangeKey.StartKey) + len(r.RangeKey.EndKey)) + } + // Count timestamps for all versions of range keys. + expInfo.AffectedVersionsRangeKeyBytes += storage.MVCCVersionTimestampSize + expInfo.AffectedVersionsRangeValBytes += int64(len(r.Value)) + } else { + newStack = append(newStack, r) + } + } + if len(newStack) > 0 { + result = append(result, newStack) + } + } + return result +} + +func mergeRanges(fragments [][]storage.MVCCRangeKeyValue) []storage.MVCCRangeKeyValue { + var result []storage.MVCCRangeKeyValue + var partialRangeKeys []storage.MVCCRangeKeyValue + var lastEnd roachpb.Key + for _, stack := range fragments { + start, end := stack[0].RangeKey.StartKey, stack[0].RangeKey.EndKey + if lastEnd.Equal(start) { + // Try merging keys by timestamp. + var newPartial []storage.MVCCRangeKeyValue + i, j := 0, 0 + for i < len(stack) && j < len(partialRangeKeys) { + switch stack[i].RangeKey.Timestamp.Compare(partialRangeKeys[j].RangeKey.Timestamp) { + case 1: + newPartial = append(newPartial, stack[i]) + i++ + case 0: + // We don't compare range values here as it would complicate things + // too much and not worth for this test as we don't expect mixed + // tombstone types. + newPartial = append(newPartial, storage.MVCCRangeKeyValue{ + RangeKey: storage.MVCCRangeKey{ + StartKey: partialRangeKeys[j].RangeKey.StartKey, + EndKey: stack[i].RangeKey.EndKey, + Timestamp: partialRangeKeys[j].RangeKey.Timestamp, + }, + Value: partialRangeKeys[j].Value, + }) + i++ + j++ + case -1: + newPartial = append(newPartial, partialRangeKeys[j]) + j++ + } + } + for ; i < len(stack); i++ { + newPartial = append(newPartial, stack[i]) + } + for ; j < len(partialRangeKeys); j++ { + newPartial = append(newPartial, partialRangeKeys[j]) + } + partialRangeKeys = newPartial + } else { + result = append(result, partialRangeKeys...) + partialRangeKeys = make([]storage.MVCCRangeKeyValue, 0, len(stack)) + partialRangeKeys = append(partialRangeKeys, stack...) + } + lastEnd = end + } + result = append(result, partialRangeKeys...) + return result +} + type fakeGCer struct { - gcKeys map[string]roachpb.GCRequest_GCKey - threshold Threshold - intents []roachpb.Intent - batches [][]roachpb.Intent - txnIntents []txnIntents + gcKeys map[string]roachpb.GCRequest_GCKey + // fake GCer stores range key batches as it since we need to be able to + // feed them into MVCCGarbageCollectRangeKeys and ranges argument should be + // non-overlapping. + gcRangeKeyBatches [][]roachpb.GCRequest_GCRangeKey + threshold Threshold + intents []roachpb.Intent + batches [][]roachpb.Intent + txnIntents []txnIntents } func makeFakeGCer() fakeGCer { @@ -581,6 +743,7 @@ func (f *fakeGCer) GC( for _, k := range keys { f.gcKeys[k.Key.String()] = k } + f.gcRangeKeyBatches = append(f.gcRangeKeyBatches, rangeKeys) return nil } @@ -609,7 +772,7 @@ func (f *fakeGCer) normalize() { f.batches = nil } -func (f *fakeGCer) requests() []roachpb.GCRequest_GCKey { +func (f *fakeGCer) pointKeys() []roachpb.GCRequest_GCKey { var reqs []roachpb.GCRequest_GCKey for _, r := range f.gcKeys { reqs = append(reqs, r) @@ -617,6 +780,18 @@ func (f *fakeGCer) requests() []roachpb.GCRequest_GCKey { return reqs } +func (f *fakeGCer) rangeKeyBatches() [][]roachpb.GCRequest_GCRangeKey { + return f.gcRangeKeyBatches +} + +func (f *fakeGCer) rangeKeys() []roachpb.GCRequest_GCRangeKey { + var reqs []roachpb.GCRequest_GCRangeKey + for _, r := range f.gcRangeKeyBatches { + reqs = append(reqs, r...) + } + return reqs +} + func intentLess(a, b *roachpb.Intent) bool { cmp := a.Key.Compare(b.Key) switch { @@ -633,3 +808,33 @@ type txnIntents struct { txn *roachpb.Transaction intents []roachpb.LockUpdate } + +// makeCollectableGCRangesFromGCRequests mirrors +// MakeCollectableGCRangesFromGCRequests to break cyclic dependecies. +func makeCollectableGCRangesFromGCRequests( + rangeStart, rangeEnd roachpb.Key, rangeKeys []roachpb.GCRequest_GCRangeKey, +) []storage.CollectableGCRangeKey { + collectableKeys := make([]storage.CollectableGCRangeKey, len(rangeKeys)) + for i, rk := range rangeKeys { + leftPeekBound := rk.StartKey.Prevish(roachpb.PrevishKeyLength) + if len(rangeStart) > 0 && leftPeekBound.Compare(rangeStart) <= 0 { + leftPeekBound = rangeStart + } + rightPeekBound := rk.EndKey.Next() + if len(rangeEnd) > 0 && rightPeekBound.Compare(rangeEnd) >= 0 { + rightPeekBound = rangeEnd + } + collectableKeys[i] = storage.CollectableGCRangeKey{ + MVCCRangeKey: storage.MVCCRangeKey{ + StartKey: rk.StartKey, + EndKey: rk.EndKey, + Timestamp: rk.Timestamp, + }, + LatchSpan: roachpb.Span{ + Key: leftPeekBound, + EndKey: rightPeekBound, + }, + } + } + return collectableKeys +} diff --git a/pkg/kv/kvserver/gc/gc_test.go b/pkg/kv/kvserver/gc/gc_test.go index b73a66479f52..29b5145b0def 100644 --- a/pkg/kv/kvserver/gc/gc_test.go +++ b/pkg/kv/kvserver/gc/gc_test.go @@ -573,8 +573,8 @@ var deleteRangeData = ` 8 | 7 | 6 | *- ->5 | *- - 4 | *- +>5 | .- + 4 | .- 3 | 2 | a b C 1 | @@ -587,8 +587,8 @@ var deleteRangeDataWithNewerValues = ` 8 | A C E *--- 7 | 6 | *-G ->5 | *- - 4 | *- I +>5 | .- + 4 | .- I 3 | 2 | b d F H i 1 | @@ -602,7 +602,7 @@ var deleteRangeMultipleValues = ` 7 | 6 | *--- >5 | - 4 | *- + 4 | .- 3 | 2 | a B C 1 | @@ -615,8 +615,8 @@ var deleteRangeDataWithIntents = ` 8 | !A !C !E 7 | 6 | *-- ->5 | *-- - 4 | *-- +>5 | .-- + 4 | .-- 3 | 2 | b d F 1 | @@ -629,16 +629,59 @@ var differentRangeStacksPerPoint = ` ---+--------------- 9 | >8 | B3 - 7 | *---------- + 7 | .---------- 6 | b2 - 5 | *---------- + 5 | .---------- 4 | a2 b1 - 3 | *---------- + 3 | .---------- 2 | a1 1 | ` +var deleteFragmentedRanges = ` + | a b c d e f g h i j +---+---------------------- + 9 | + 8 | A C F + 7 | + 6 | +>5 | .-- + 4 | d + 3 | .-------- + 2 | b f g + 1 | +` + +var deleteMergesRanges = ` + | a bb ccc d +---+--------------- + 9 | + 8 | A B F + 7 | *---------- + 6 | *---------- +>5 | .-- + 4 | + 3 | c + 2 | + 1 | +` + +var avoidMergingDifferentTs = ` + | a bb ccc d e +---+--------------- + 9 | + 8 | *-- + 7 | *-- + 6 | +>5 | .----- + 4 | + 3 | + 2 | + 1 | +` + func TestGC(t *testing.T) { + defer leaktest.AfterTest(t)() for _, d := range []struct { name string data string @@ -657,6 +700,9 @@ func TestGC(t *testing.T) { {name: "delete_range_multiple_points", data: deleteRangeMultipleValues}, {name: "delete_range_with_intents", data: deleteRangeDataWithIntents}, {name: "delete_with_different_range_stacks", data: differentRangeStacksPerPoint}, + {name: "delete_fragments_ranges", data: deleteFragmentedRanges}, + {name: "delete_merges_rages", data: deleteMergesRanges}, + {name: "avoid_merging_different_ts", data: avoidMergingDifferentTs}, } { t.Run(d.name, func(t *testing.T) { runTest(t, d.data) @@ -688,10 +734,18 @@ func runTest(t *testing.T, data string) { gcer.resolveIntents, gcer.resolveIntentsAsync) require.NoError(t, err) require.Empty(t, gcer.intents, "expecting no intents") - require.NoError(t, storage.MVCCGarbageCollect(ctx, eng, &stats, gcer.requests(), gcTS)) + require.NoError(t, + storage.MVCCGarbageCollect(ctx, eng, &stats, gcer.pointKeys(), gcTS)) + + for _, batch := range gcer.rangeKeyBatches() { + rangeKeys := makeCollectableGCRangesFromGCRequests(desc.StartKey.AsRawKey(), + desc.EndKey.AsRawKey(), batch) + require.NoError(t, + storage.MVCCGarbageCollectRangeKeys(ctx, eng, &stats, rangeKeys)) + } ctrlEng := storage.NewDefaultInMemForTesting() - defer eng.Close() + defer ctrlEng.Close() expectedStats := dataItems.liveDistribution().setupTest(t, ctrlEng, desc) if log.V(1) { @@ -718,6 +772,8 @@ func runTest(t *testing.T, data string) { func requireEqualReaders( t *testing.T, exected storage.Reader, actual storage.Reader, desc roachpb.RangeDescriptor, ) { + // First compare only points. We assert points and ranges separately for + // simplicity. itExp := exected.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ LowerBound: desc.StartKey.AsRawKey(), UpperBound: desc.EndKey.AsRawKey(), @@ -725,6 +781,7 @@ func requireEqualReaders( RangeKeyMaskingBelow: hlc.Timestamp{}, }) defer itExp.Close() + itExp.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) itActual := actual.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ LowerBound: desc.StartKey.AsRawKey(), @@ -733,8 +790,8 @@ func requireEqualReaders( RangeKeyMaskingBelow: hlc.Timestamp{}, }) defer itActual.Close() - itExp.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) itActual.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) + for { okExp, err := itExp.Valid() require.NoError(t, err, "failed to iterate values") @@ -750,10 +807,43 @@ func requireEqualReaders( itActual.UnsafeKey()) require.True(t, bytes.Equal(itExp.UnsafeValue(), itActual.UnsafeValue()), "expected value not equal to actual for key %s", itExp.UnsafeKey()) - itExp.Next() itActual.Next() } + + // Compare only ranges. + itExpRanges := exected.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ + LowerBound: desc.StartKey.AsRawKey(), + UpperBound: desc.EndKey.AsRawKey(), + KeyTypes: storage.IterKeyTypeRangesOnly, + RangeKeyMaskingBelow: hlc.Timestamp{}, + }) + defer itExpRanges.Close() + itExpRanges.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) + + itActualRanges := actual.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{ + LowerBound: desc.StartKey.AsRawKey(), + UpperBound: desc.EndKey.AsRawKey(), + KeyTypes: storage.IterKeyTypeRangesOnly, + RangeKeyMaskingBelow: hlc.Timestamp{}, + }) + defer itActualRanges.Close() + itActualRanges.SeekGE(storage.MVCCKey{Key: desc.StartKey.AsRawKey()}) + + for { + okExp, err := itExpRanges.Valid() + require.NoError(t, err, "failed to iterate ranges") + okAct, err := itActualRanges.Valid() + require.NoError(t, err, "failed to iterate ranges") + if !okExp && !okAct { + break + } + + require.Equal(t, okExp, okAct, "range iterators have different number of elements") + require.EqualValues(t, itExpRanges.RangeKeys(), itActualRanges.RangeKeys(), "range keys") + itExpRanges.Next() + itActualRanges.Next() + } } // dataItem is element read from test table containing mvcc key value along with @@ -1222,3 +1312,132 @@ func formatTable(data []tableCell, prefix roachpb.Key) []string { } return result } + +func TestRangeKeyBatching(t *testing.T) { + defer leaktest.AfterTest(t)() + ctx := context.Background() + + mkKey := func(key string) roachpb.Key { + var k roachpb.Key + k = append(k, keys.SystemSQLCodec.TablePrefix(42)...) + k = append(k, key...) + return k + } + + mkKvs := func(start, end string, tss ...int) []storage.MVCCRangeKeyValue { + var result []storage.MVCCRangeKeyValue + for _, ts := range tss { + result = append(result, storage.MVCCRangeKeyValue{ + RangeKey: storage.MVCCRangeKey{ + StartKey: mkKey(start), + EndKey: mkKey(end), + Timestamp: hlc.Timestamp{ + WallTime: int64(ts) * time.Second.Nanoseconds(), + }, + }, + }) + } + return result + } + + mkGCr := func(start, end string, ts int) roachpb.GCRequest_GCRangeKey { + return roachpb.GCRequest_GCRangeKey{ + StartKey: mkKey(start), + EndKey: mkKey(end), + Timestamp: hlc.Timestamp{ + WallTime: int64(ts) * time.Second.Nanoseconds(), + }, + } + } + + for _, data := range []struct { + name string + data [][]storage.MVCCRangeKeyValue + batchSize int64 + expect []roachpb.GCRequest_GCRangeKey + }{ + { + name: "single batch", + data: [][]storage.MVCCRangeKeyValue{ + mkKvs("a", "b", 5, 3, 1), + mkKvs("c", "d", 5, 2, 1), + }, + batchSize: 99999, + expect: []roachpb.GCRequest_GCRangeKey{ + mkGCr("a", "b", 5), + mkGCr("c", "d", 5), + }, + }, + { + name: "merge adjacent", + data: [][]storage.MVCCRangeKeyValue{ + mkKvs("a", "b", 5, 3, 1), + mkKvs("b", "c", 5, 2), + mkKvs("c", "d", 3, 2), + }, + batchSize: 99999, + expect: []roachpb.GCRequest_GCRangeKey{ + mkGCr("a", "c", 5), + mkGCr("c", "d", 3), + }, + }, + { + name: "batch split stack", + data: [][]storage.MVCCRangeKeyValue{ + mkKvs("a", "b", 5, 3, 1), + mkKvs("b", "c", 5, 2), + mkKvs("c", "d", 3, 2), + }, + batchSize: 40, // We could only fit 2 keys in a batch. + expect: []roachpb.GCRequest_GCRangeKey{ + mkGCr("a", "b", 3), + mkGCr("a", "b", 5), + mkGCr("b", "c", 2), + mkGCr("b", "c", 5), + mkGCr("c", "d", 2), + mkGCr("c", "d", 3), + }, + }, + { + name: "batch split keys", + data: [][]storage.MVCCRangeKeyValue{ + mkKvs("a", "b", 5, 3, 1), + mkKvs("b", "c", 5, 2, 1), + mkKvs("c", "d", 3, 2), + }, + batchSize: 50, // We could only fit 3 keys in a batch. + expect: []roachpb.GCRequest_GCRangeKey{ + mkGCr("a", "b", 5), + mkGCr("b", "c", 5), + mkGCr("c", "d", 3), + }, + }, + { + name: "batch split and merge", + data: [][]storage.MVCCRangeKeyValue{ + mkKvs("a", "b", 5, 3), + mkKvs("b", "c", 5, 2), + mkKvs("c", "d", 5, 1), + }, + batchSize: 85, // We could only fit 5 keys in a batch. + expect: []roachpb.GCRequest_GCRangeKey{ + mkGCr("a", "c", 5), + mkGCr("c", "d", 1), + mkGCr("c", "d", 5), + }, + }, + } { + t.Run(data.name, func(t *testing.T) { + gcer := makeFakeGCer() + b := rangeKeyBatcher{ + gcer: &gcer, + batchSize: data.batchSize, + } + for _, d := range data.data { + require.NoError(t, b.addAndMaybeFlushRangeFragment(ctx, d), "failed to gc ranges") + } + require.NoError(t, b.flushPendingFragments(ctx), "failed to gc ranges") + require.EqualValues(t, data.expect, gcer.rangeKeys()) + }) + } +} diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 9adfaf4e4020..13da637c04e8 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1144,6 +1144,12 @@ difficult to meaningfully interpret this metric.`, Measurement: "Keys", Unit: metric.Unit_COUNT, } + metaGCNumRangeKeysAffected = metric.Metadata{ + Name: "queue.gc.info.numrangekeysaffected", + Help: "Number of range keys GC'able", + Measurement: "Range Keys", + Unit: metric.Unit_COUNT, + } metaGCIntentsConsidered = metric.Metadata{ Name: "queue.gc.info.intentsconsidered", Help: "Number of 'old' intents", @@ -1658,6 +1664,7 @@ type StoreMetrics struct { // GCInfo cumulative totals. GCNumKeysAffected *metric.Counter + GCNumRangeKeysAffected *metric.Counter GCIntentsConsidered *metric.Counter GCIntentTxns *metric.Counter GCTransactionSpanScanned *metric.Counter @@ -2136,6 +2143,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // GCInfo cumulative totals. GCNumKeysAffected: metric.NewCounter(metaGCNumKeysAffected), + GCNumRangeKeysAffected: metric.NewCounter(metaGCNumRangeKeysAffected), GCIntentsConsidered: metric.NewCounter(metaGCIntentsConsidered), GCIntentTxns: metric.NewCounter(metaGCIntentTxns), GCTransactionSpanScanned: metric.NewCounter(metaGCTransactionSpanScanned), diff --git a/pkg/kv/kvserver/mvcc_gc_queue.go b/pkg/kv/kvserver/mvcc_gc_queue.go index 5a42f2cf8303..23b3756db078 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue.go +++ b/pkg/kv/kvserver/mvcc_gc_queue.go @@ -490,6 +490,7 @@ func (r *replicaGCer) GC( } req := r.template() req.Keys = keys + req.RangeKeys = rangeKeys return r.send(ctx, req) } @@ -622,6 +623,7 @@ func (mgcq *mvccGCQueue) process( func updateStoreMetricsWithGCInfo(metrics *StoreMetrics, info gc.Info) { metrics.GCNumKeysAffected.Inc(int64(info.NumKeysAffected)) + metrics.GCNumRangeKeysAffected.Inc(int64(info.NumRangeKeysAffected)) metrics.GCIntentsConsidered.Inc(int64(info.IntentsConsidered)) metrics.GCIntentTxns.Inc(int64(info.IntentTxns)) metrics.GCTransactionSpanScanned.Inc(int64(info.TransactionSpanTotal)) diff --git a/pkg/kv/kvserver/mvcc_gc_queue_test.go b/pkg/kv/kvserver/mvcc_gc_queue_test.go index 9e8ca614c722..85b63c70b7b6 100644 --- a/pkg/kv/kvserver/mvcc_gc_queue_test.go +++ b/pkg/kv/kvserver/mvcc_gc_queue_test.go @@ -15,6 +15,7 @@ import ( "fmt" "math/rand" "reflect" + "sort" "sync/atomic" "testing" "testing/quick" @@ -500,55 +501,99 @@ func TestMVCCGCQueueProcess(t *testing.T) { key9 := mkKey("i") key10 := mkKey("j") key11 := mkKey("k") + key12 := mkKey("l") + key13 := mkKey("m") + key14 := mkKey("n") + key15 := mkKey("o") + + type kvData struct { + key roachpb.Key + endKey roachpb.Key + ts hlc.Timestamp + del bool + txn bool + } - data := []struct { - key roachpb.Key - ts hlc.Timestamp - del bool - txn bool - }{ + mkVal := func(key roachpb.Key, ts hlc.Timestamp) kvData { + return kvData{key: key, ts: ts} + } + mkDel := func(key roachpb.Key, ts hlc.Timestamp) kvData { + return kvData{key: key, ts: ts, del: true} + } + mkTxn := func(data kvData) kvData { + data.txn = true + return data + } + mkRng := func(key, endKey roachpb.Key, ts hlc.Timestamp) kvData { + return kvData{key: key, endKey: endKey, ts: ts, del: true} + } + + data := []kvData{ // For key1, we expect first value to GC. - {key1, ts1, false, false}, - {key1, ts2, false, false}, - {key1, ts5, false, false}, + mkVal(key1, ts1), + mkVal(key1, ts2), + mkVal(key1, ts5), // For key2, we expect values to GC, even though most recent is deletion. - {key2, ts1, false, false}, - {key2, ts2m1, false, false}, // use a value < the GC time to verify it's kept - {key2, ts5, true, false}, + mkVal(key2, ts1), + mkVal(key2, ts2m1), // use a value < the GC time to verify it's kept + mkDel(key2, ts5), // For key3, we expect just ts1 to GC, because most recent deletion is intent. - {key3, ts1, false, false}, - {key3, ts2, false, false}, - {key3, ts5, true, true}, + mkVal(key3, ts1), + mkVal(key3, ts2), + mkTxn(mkDel(key3, ts5)), // For key4, expect oldest value to GC. - {key4, ts1, false, false}, - {key4, ts2, false, false}, + mkVal(key4, ts1), + mkVal(key4, ts2), // For key5, expect all values to GC (most recent value deleted). - {key5, ts1, false, false}, - {key5, ts2, true, false}, // deleted, so GC + mkVal(key5, ts1), + mkDel(key5, ts2), // deleted, so GC // For key6, expect no values to GC because most recent value is intent. - {key6, ts1, false, false}, - {key6, ts5, false, true}, + mkVal(key6, ts1), + mkTxn(mkVal(key6, ts5)), // For key7, expect no values to GC because intent is exactly 2h old. - {key7, ts2, false, false}, - {key7, ts4, false, true}, + mkVal(key7, ts2), + mkTxn(mkVal(key7, ts4)), // For key8, expect most recent value to resolve by aborting, which will clean it up. - {key8, ts2, false, false}, - {key8, ts3, true, true}, + mkVal(key8, ts2), + mkTxn(mkDel(key8, ts3)), // For key9, resolve naked intent with no remaining values. - {key9, ts3, false, true}, + mkTxn(mkVal(key9, ts3)), // For key10, GC ts1 because it's a delete but not ts3 because it's above the threshold. - {key10, ts1, true, false}, - {key10, ts3, true, false}, - {key10, ts4, false, false}, - {key10, ts5, false, false}, + mkDel(key10, ts1), + mkDel(key10, ts3), + mkVal(key10, ts4), + mkVal(key10, ts5), // For key11, we can't GC anything because ts1 isn't a delete. - {key11, ts1, false, false}, - {key11, ts3, true, false}, - {key11, ts4, true, false}, - {key11, ts5, true, false}, + mkVal(key11, ts1), + mkDel(key11, ts3), + mkDel(key11, ts4), + mkDel(key11, ts5), + // key12 has its older version covered by range tombstone and should be GCd + mkVal(key12, ts1), + mkVal(key12, ts5), + // key13 has all versions covered by range tombstone + mkVal(key13, ts1), + // This is old range tombstone below gc threshold + mkRng(key12, key14, ts2), + // This is newer range tombstone above gc threshold + mkRng(key13, key15, ts3), } + sort.Slice(data, func(i, j int) bool { + return data[i].ts.Less(data[j].ts) + }) + for i, datum := range data { + if len(datum.endKey) > 0 { + drArgs := deleteRangeArgs(datum.key, datum.endKey) + drArgs.UseRangeTombstone = true + if _, err := tc.SendWrappedWith(roachpb.Header{ + Timestamp: datum.ts, + }, &drArgs); err != nil { + t.Fatalf("%d: could not delete data: %+v", i, err) + } + continue + } if datum.del { dArgs := deleteArgs(datum.key) var txn *roachpb.Transaction @@ -566,23 +611,23 @@ func TestMVCCGCQueueProcess(t *testing.T) { }, &dArgs); err != nil { t.Fatalf("%d: could not delete data: %+v", i, err) } - } else { - pArgs := putArgs(datum.key, []byte("value")) - var txn *roachpb.Transaction - if datum.txn { - txn = newTransaction("test", datum.key, 1, tc.Clock()) - // Overwrite the timestamps set by newTransaction(). - txn.ReadTimestamp = datum.ts - txn.WriteTimestamp = datum.ts - txn.MinTimestamp = datum.ts - assignSeqNumsForReqs(txn, &pArgs) - } - if _, err := tc.SendWrappedWith(roachpb.Header{ - Timestamp: datum.ts, - Txn: txn, - }, &pArgs); err != nil { - t.Fatalf("%d: could not put data: %+v", i, err) - } + continue + } + pArgs := putArgs(datum.key, []byte("value")) + var txn *roachpb.Transaction + if datum.txn { + txn = newTransaction("test", datum.key, 1, tc.Clock()) + // Overwrite the timestamps set by newTransaction(). + txn.ReadTimestamp = datum.ts + txn.WriteTimestamp = datum.ts + txn.MinTimestamp = datum.ts + assignSeqNumsForReqs(txn, &pArgs) + } + if _, err := tc.SendWrappedWith(roachpb.Header{ + Timestamp: datum.ts, + Txn: txn, + }, &pArgs); err != nil { + t.Fatalf("%d: could not put data: %+v", i, err) } } @@ -591,18 +636,30 @@ func TestMVCCGCQueueProcess(t *testing.T) { t.Fatal(err) } - // The total size of the GC'able versions of the keys and values in Info. - // Key size: len(scratch+"a") + MVCCVersionTimestampSize (13 bytes) = 15 bytes. - // Value size: len("value") + headerSize (5 bytes) = 10 bytes. - // key1 at ts1 (15 bytes) => "value" (10 bytes) - // key2 at ts1 (15 bytes) => "value" (10 bytes) - // key3 at ts1 (15 bytes) => "value" (10 bytes) - // key4 at ts1 (15 bytes) => "value" (10 bytes) - // key5 at ts1 (15 bytes) => "value" (10 bytes) - // key5 at ts2 (15 bytes) => delete (0 bytes) - // key10 at ts1 (15 bytes) => delete (0 bytes) - var expectedVersionsKeyBytes int64 = 7 * 15 - var expectedVersionsValBytes int64 = 5 * 10 + // TODO: following computations should take care of new local timestamp + // for tombstones as they can be non-zero in size. + var ( + // The total size of the GC'able versions of the keys and values in Info. + // Key size: len(scratch+"a") + 1 + MVCCVersionTimestampSize (12 bytes) = 15 bytes. + // Value size: len("value") + headerSize (5 bytes) = 10 bytes. + // key1 at ts1 (15 bytes) => "value" (10 bytes) + // key2 at ts1 (15 bytes) => "value" (10 bytes) + // key3 at ts1 (15 bytes) => "value" (10 bytes) + // key4 at ts1 (15 bytes) => "value" (10 bytes) + // key5 at ts1 (15 bytes) => "value" (10 bytes) + // key5 at ts2 (15 bytes) => delete (0 bytes) + // key10 at ts1 (15 bytes) => delete (0 bytes) + // key12 at ts1 (15 bytes) => "value" (10 bytes) + // key13 at ts1 (15 bytes) => "value" (10 bytes) + expectedVersionsKeyBytes int64 = 9 * 15 + expectedVersionsValBytes int64 = 7 * 10 + // Range Key size: len(scratch + "x") * 2 + timestamp (12) + // Range Value size: 0 for deletion + // key13, key14 at ts1 (12) => delete (0 bytes) + // key14, key15 at ts1 (16) => delete (0 bytes) + expectedVersionsRangeKeyBytes int64 = 12 + 16 + expectedVersionsRangeValBytes int64 = 0 + ) // Call Run with dummy functions to get current Info. gcInfo, err := func() (gc.Info, error) { @@ -630,10 +687,20 @@ func TestMVCCGCQueueProcess(t *testing.T) { t.Fatal(err) } if gcInfo.AffectedVersionsKeyBytes != expectedVersionsKeyBytes { - t.Errorf("expected total keys size: %d bytes; got %d bytes", expectedVersionsKeyBytes, gcInfo.AffectedVersionsKeyBytes) + t.Errorf("expected total keys size: %d bytes; got %d bytes", expectedVersionsKeyBytes, + gcInfo.AffectedVersionsKeyBytes) } if gcInfo.AffectedVersionsValBytes != expectedVersionsValBytes { - t.Errorf("expected total values size: %d bytes; got %d bytes", expectedVersionsValBytes, gcInfo.AffectedVersionsValBytes) + t.Errorf("expected total values size: %d bytes; got %d bytes", expectedVersionsValBytes, + gcInfo.AffectedVersionsValBytes) + } + if gcInfo.AffectedVersionsRangeKeyBytes != expectedVersionsRangeKeyBytes { + t.Errorf("expected total range key size: %d bytes; got %d bytes", expectedVersionsRangeKeyBytes, + gcInfo.AffectedVersionsRangeKeyBytes) + } + if gcInfo.AffectedVersionsRangeValBytes != expectedVersionsRangeValBytes { + t.Errorf("expected total range value size: %d bytes; got %d bytes", expectedVersionsRangeValBytes, + gcInfo.AffectedVersionsRangeValBytes) } // Process through a scan queue. @@ -670,6 +737,7 @@ func TestMVCCGCQueueProcess(t *testing.T) { {key11, ts4}, {key11, ts3}, {key11, ts1}, + {key12, ts5}, } // Read data directly from engine to avoid intent errors from MVCC. // However, because the GC processing pushes transactions and @@ -694,6 +762,7 @@ func TestMVCCGCQueueProcess(t *testing.T) { } log.VEventf(ctx, 2, "%d: %s", i, kv.Key) } + t.Log("success") return nil }) } diff --git a/pkg/kv/kvserver/rditer/replica_data_iter.go b/pkg/kv/kvserver/rditer/replica_data_iter.go index 72f2ae08c5c1..c199cf6864ae 100644 --- a/pkg/kv/kvserver/rditer/replica_data_iter.go +++ b/pkg/kv/kvserver/rditer/replica_data_iter.go @@ -22,6 +22,16 @@ type KeyRange struct { Start, End roachpb.Key } +// ReplicaDataIteratorOptions defines ReplicaMVCCDataIterator creation options. +type ReplicaDataIteratorOptions struct { + // See NewReplicaMVCCDataIterator for details. + Reverse bool + // IterKind is passed to underlying iterator to select desired value types. + IterKind storage.MVCCIterKind + // KeyTypes is passed to underlying iterator to select desired key types. + KeyTypes storage.IterKeyType +} + // ReplicaMVCCDataIterator provides a complete iteration over MVCC or unversioned // (which can be made to look like an MVCCKey) key / value // rows in a range, including system-local metadata and user data. @@ -36,14 +46,15 @@ type KeyRange struct { // TODO(sumeer): merge with ReplicaEngineDataIterator. We can use an EngineIterator // for MVCC key ranges and convert from EngineKey to MVCCKey. type ReplicaMVCCDataIterator struct { + ReplicaDataIteratorOptions + reader storage.Reader curIndex int ranges []KeyRange // When it is non-nil, it represents the iterator for curIndex. // A non-nil it is valid, else it is either done, or err != nil. - it storage.MVCCIterator - err error - reverse bool + it storage.MVCCIterator + err error } // ReplicaEngineDataIterator provides a complete iteration over all data in a @@ -208,9 +219,9 @@ func MakeUserKeyRange(d *roachpb.RangeDescriptor) KeyRange { // replica. It iterates over the replicated key ranges excluding the lock // table key range. Separated locks are made to appear as interleaved. The // iterator can do one of reverse or forward iteration, based on whether -// seekEnd is true or false, respectively. With reverse iteration, it is -// initially positioned at the end of the last range, else it is initially -// positioned at the start of the first range. +// Reverse is true or false in ReplicaDataIteratorOptions, respectively. +// With reverse iteration, it is initially positioned at the end of the last +// range, else it is initially positioned at the start of the first range. // // The iterator requires the reader.ConsistentIterators is true, since it // creates a different iterator for each replicated key range. This is because @@ -220,17 +231,17 @@ func MakeUserKeyRange(d *roachpb.RangeDescriptor) KeyRange { // TODO(erikgrinaker): ReplicaMVCCDataIterator does not support MVCC range keys. // This should be deprecated in favor of e.g. ReplicaEngineDataIterator. func NewReplicaMVCCDataIterator( - d *roachpb.RangeDescriptor, reader storage.Reader, seekEnd bool, + d *roachpb.RangeDescriptor, reader storage.Reader, opts ReplicaDataIteratorOptions, ) *ReplicaMVCCDataIterator { if !reader.ConsistentIterators() { panic("ReplicaMVCCDataIterator needs a Reader that provides ConsistentIterators") } ri := &ReplicaMVCCDataIterator{ - reader: reader, - ranges: MakeReplicatedKeyRangesExceptLockTable(d), - reverse: seekEnd, + ReplicaDataIteratorOptions: opts, + reader: reader, + ranges: MakeReplicatedKeyRangesExceptLockTable(d), } - if ri.reverse { + if ri.Reverse { ri.curIndex = len(ri.ranges) - 1 } else { ri.curIndex = 0 @@ -249,13 +260,13 @@ func (ri *ReplicaMVCCDataIterator) tryCloseAndCreateIter() { return } ri.it = ri.reader.NewMVCCIterator( - storage.MVCCKeyAndIntentsIterKind, + ri.IterKind, storage.IterOptions{ LowerBound: ri.ranges[ri.curIndex].Start, UpperBound: ri.ranges[ri.curIndex].End, - KeyTypes: storage.IterKeyTypePointsAndRanges, + KeyTypes: ri.KeyTypes, }) - if ri.reverse { + if ri.Reverse { ri.it.SeekLT(storage.MakeMVCCMetadataKey(ri.ranges[ri.curIndex].End)) } else { ri.it.SeekGE(storage.MakeMVCCMetadataKey(ri.ranges[ri.curIndex].Start)) @@ -264,7 +275,7 @@ func (ri *ReplicaMVCCDataIterator) tryCloseAndCreateIter() { ri.err = err return } - if ri.reverse { + if ri.Reverse { ri.curIndex-- } else { ri.curIndex++ @@ -282,7 +293,7 @@ func (ri *ReplicaMVCCDataIterator) Close() { // Next advances to the next key in the iteration. func (ri *ReplicaMVCCDataIterator) Next() { - if ri.reverse { + if ri.Reverse { panic("Next called on reverse iterator") } ri.it.Next() @@ -299,7 +310,7 @@ func (ri *ReplicaMVCCDataIterator) Next() { // Prev advances the iterator one key backwards. func (ri *ReplicaMVCCDataIterator) Prev() { - if !ri.reverse { + if !ri.Reverse { panic("Prev called on forward iterator") } ri.it.Prev() diff --git a/pkg/kv/kvserver/rditer/replica_data_iter_test.go b/pkg/kv/kvserver/rditer/replica_data_iter_test.go index 5f5c3f266439..bfea55947089 100644 --- a/pkg/kv/kvserver/rditer/replica_data_iter_test.go +++ b/pkg/kv/kvserver/rditer/replica_data_iter_test.go @@ -148,7 +148,11 @@ func verifyRDReplicatedOnlyMVCCIter( }, hlc.Timestamp{WallTime: 42}) readWriter = spanset.NewReadWriterAt(readWriter, &spans, hlc.Timestamp{WallTime: 42}) } - iter := NewReplicaMVCCDataIterator(desc, readWriter, reverse /* seekEnd */) + iter := NewReplicaMVCCDataIterator(desc, readWriter, ReplicaDataIteratorOptions{ + Reverse: reverse, + IterKind: storage.MVCCKeyAndIntentsIterKind, + KeyTypes: storage.IterKeyTypePointsAndRanges, + }) defer iter.Close() next := iter.Next if reverse { diff --git a/pkg/roachpb/api.proto b/pkg/roachpb/api.proto index 32767f65a3dd..15558ade9add 100644 --- a/pkg/roachpb/api.proto +++ b/pkg/roachpb/api.proto @@ -936,19 +936,18 @@ message HeartbeatTxnResponse { message GCRequest { RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true]; - // Point keys message GCKey { bytes key = 1 [(gogoproto.casttype) = "Key"]; util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false]; } repeated GCKey keys = 3 [(gogoproto.nullable) = false]; - // Range keys + message GCRangeKey { - bytes startKey = 1 [(gogoproto.casttype) = "Key"]; - bytes endKey = 2 [(gogoproto.casttype) = "Key"]; + bytes start_key = 1 [(gogoproto.casttype) = "Key"]; + bytes end_key = 2 [(gogoproto.casttype) = "Key"]; util.hlc.Timestamp timestamp = 3 [(gogoproto.nullable) = false]; } - repeated GCRangeKey rangeKeys = 6 [(gogoproto.nullable) = false]; + repeated GCRangeKey range_keys = 6 [(gogoproto.nullable) = false]; // Threshold is the expiration timestamp. util.hlc.Timestamp threshold = 4 [(gogoproto.nullable) = false]; diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 756ef7119212..fb6d70c1afa3 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -4296,12 +4296,14 @@ func MVCCGarbageCollect( if ok, err := iter.Valid(); err != nil { return err } else if ok { - // Use the previous version's timestamp if it's for this key. - if iter.UnsafeKey().Key.Equal(gcKey.Key) { - prevNanos = iter.UnsafeKey().Timestamp.WallTime + if hasPoint, _ := iter.HasPointAndRange(); hasPoint { + // Use the previous version's timestamp if it's for this key. + if iter.UnsafeKey().Key.Equal(gcKey.Key) { + prevNanos = iter.UnsafeKey().Timestamp.WallTime + } + // Seek to the first version for deletion. + iter.Next() } - // Seek to the first version for deletion. - iter.Next() } } } @@ -4392,6 +4394,325 @@ func MVCCGarbageCollect( return nil } +// tsVal encapsulate info about previous range key stack with actual keys +// stripped for efficiency. +type tsVal struct { + ts hlc.Timestamp + val []byte +} + +// rangeKeyMergeTracker is tracking potential merges of range key fragments +// when some of the versions are removed. Tracker stores minimal information +// internally that is required to decide if range key fragments will merge. +type rangeKeyMergeTracker struct { + prevEndKey roachpb.Key + prevTsValues []tsVal + ms *enginepb.MVCCStats +} + +// update updates MVCCStats if previous range key fragments are adjacent to +// current ones and remaining ones in current stack of fragments (indicated by +// removed flag), match exactly with the previous stack (by their timestamps +// and values). +func (t *rangeKeyMergeTracker) update( + startKey, endKey roachpb.Key, unsafeRangeKeys []MVCCRangeKeyValue, +) { + if t.ms == nil { + return + } + if keyCount := len(unsafeRangeKeys); keyCount > 0 && keyCount == len(t.prevTsValues) && t.prevEndKey.Equal(startKey) { + matching := true + for i, pts := range t.prevTsValues { + // We compare bytes directly since we are not interested in value + // equivalence, but identical values for merging keys. e.g. different + // value headers should not merge. + if rkv := unsafeRangeKeys[i]; !pts.ts.Equal(rkv.RangeKey.Timestamp) || !bytes.Equal(pts.val, rkv.Value) { + matching = false + break + } + } + if matching { + // All timestamps in range tombstone history matched with remaining + // timestamp in current history. Range tombstones would merge. + t.ms.Add(updateStatsOnRangeTombstoneMerge(unsafeRangeKeys)) + } + } + t.prevTsValues = t.prevTsValues[:0] + for _, rk := range unsafeRangeKeys { + t.prevTsValues = append(t.prevTsValues, tsVal{ + ts: rk.RangeKey.Timestamp, + val: append([]byte(nil), rk.Value...), + }) + } + t.prevEndKey = endKey.Clone() +} + +// CollectableGCRangeKey is a struct containing range key as well as span +// boundaries locked for particular range key. +// Range GC needs a latch span as it needs to expand iteration beyond the +// range key itself to find adjacent ranges and those ranges should be safe to +// read. +type CollectableGCRangeKey struct { + MVCCRangeKey + LatchSpan roachpb.Span +} + +// MVCCGarbageCollectRangeKeys is similar in functionality to MVCCGarbageCollect but +// operates on range keys. It does sanity checks that no values exist below +// range tombstones so that no values are exposed in case point values GC was +// not performed correctly by the level above. +func MVCCGarbageCollectRangeKeys( + ctx context.Context, rw ReadWriter, ms *enginepb.MVCCStats, rks []CollectableGCRangeKey, +) error { + + var count int64 + defer func(begin time.Time) { + // TODO(oleg): this could be misleading if GC fails, but this function still + // reports how many keys were GC'd. The approach is identical to what point + // key GC does for consistency, but both places could be improved. + log.Eventf(ctx, + "done with GC evaluation for %d range keys at %.2f keys/sec. Deleted %d entries", + len(rks), float64(len(rks))*1e9/float64(timeutil.Since(begin)), count) + }(timeutil.Now()) + + if len(rks) == 0 { + return nil + } + + // Validate range keys are well formed. + for _, rk := range rks { + if err := rk.Validate(); err != nil { + return errors.Wrap(err, "failed to validate gc range keys in mvcc gc") + } + } + + sort.Slice(rks, func(i, j int) bool { + return rks[i].Compare(rks[j].MVCCRangeKey) < 0 + }) + + // Validate that keys are non-overlapping. + for i := 1; i < len(rks); i++ { + if rks[i].StartKey.Compare(rks[i-1].EndKey) < 0 { + return errors.Errorf("range keys in gc request should be non-overlapping: %s vs %s", + rks[i-1].String(), rks[i].String()) + } + } + + var iter MVCCIterator + var ptIter *MVCCIncrementalIterator + + defer func() { + if iter != nil { + iter.Close() + } + if ptIter != nil { + ptIter.Close() + } + }() + + for _, gcKey := range rks { + mergeTracker := rangeKeyMergeTracker{ms: ms} + + // Bound the iterator appropriately for the set of keys we'll be garbage + // collecting. We are using latch bounds to collect info about adjacent + // range fragments for correct MVCCStats updates. + iter = rw.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + LowerBound: gcKey.LatchSpan.Key, + UpperBound: gcKey.LatchSpan.EndKey, + KeyTypes: IterKeyTypeRangesOnly, + }) + + iter.SeekGE(MVCCKey{Key: gcKey.LatchSpan.Key}) + + for ; ; iter.Next() { + if ok, err := iter.Valid(); err != nil { + return err + } else if !ok { + break + } + + bounds := iter.RangeBounds() + unsafeRangeKeys := iter.RangeKeys() + + // Check if preceding range tombstone is adjacent to GC'd one. If we + // started iterating too early, just skip to next key. If boundaries + // match, then we capture start of the range and timestamps for later. + switch bounds.EndKey.Compare(gcKey.StartKey) { + case -1: + continue + case 0: + mergeTracker.update(bounds.Key, bounds.EndKey, unsafeRangeKeys) + continue + } + + // Terminate loop once we've reached a range tombstone past the right + // GC range key boundary. + if cmp := bounds.Key.Compare(gcKey.EndKey); cmp >= 0 { + mergeTracker.update(bounds.Key, bounds.EndKey, unsafeRangeKeys) + break + } + + // Check if we have a partial overlap between range tombstone and + // requested GCd range. This shouldn't happen in most cases, but we can + // have a range merge between GC run and cmd_gc execution or + // erroneous GC request and mvcc stats should be updated correctly. + // In those cases gcKey boundaries will not match underlying range + // tombstone boundaries and need to be adjusted. + trimLeft, trimRight := false, false + gcedRangeStartKey := bounds.Key + if gcKey.StartKey.Compare(gcedRangeStartKey) > 0 { + gcedRangeStartKey = gcKey.StartKey + trimLeft = true + } + gcedRangeEndKey := bounds.EndKey + if gcKey.EndKey.Compare(gcedRangeEndKey) < 0 { + gcedRangeEndKey = gcKey.EndKey + trimRight = true + } + + gcedRange := MVCCRangeKeyValue{ + RangeKey: MVCCRangeKey{ + StartKey: gcedRangeStartKey, + EndKey: gcedRangeEndKey, + }, + } + remaining := len(unsafeRangeKeys) + for i, rkv := range unsafeRangeKeys { + gcedRange.RangeKey.Timestamp = rkv.RangeKey.Timestamp + gcedRange.Value = rkv.Value + remove := rkv.RangeKey.Timestamp.LessEq(gcKey.Timestamp) + if remove { + if err := rw.ClearMVCCRangeKey(gcedRange.RangeKey); err != nil { + return err + } + remaining-- + count++ + } + if ms != nil { + topRangeKey := i == 0 + if remove { + ms.Add(updateStatsOnRangeTombstoneGC(gcedRange, topRangeKey)) + } + if trimLeft { + ms.Add(updateStatsOnRangeTombstoneSplit(gcedRange, gcedRange.RangeKey.StartKey, topRangeKey)) + } + if trimRight { + ms.Add(updateStatsOnRangeTombstoneSplit(gcedRange, gcedRange.RangeKey.EndKey, topRangeKey)) + } + } + } + + mergeTracker.update(gcedRangeStartKey, gcedRangeEndKey, unsafeRangeKeys[0:remaining]) + + // If we didn't find any removable fragments, shortcut without checking + // underlying keys. + if remaining == len(unsafeRangeKeys) { + continue + } + + // Verify that there are no remaining data under the deleted range using + // time bound iterator. + ptIter = NewMVCCIncrementalIterator(rw, MVCCIncrementalIterOptions{ + KeyTypes: IterKeyTypePointsOnly, + StartKey: gcedRangeStartKey, + EndKey: gcedRangeEndKey, + EndTime: gcKey.Timestamp, + IntentPolicy: MVCCIncrementalIterIntentPolicyEmit, + }) + ptIter.SeekGE(MVCCKey{Key: gcedRangeStartKey}) + for ; ; ptIter.Next() { + if ok, err := ptIter.Valid(); err != nil { + return err + } else if !ok { + break + } + // Disallow any value under the range key. We only skip intents as they + // must have a provisional value with appropriate timestamp. + if pointKey := ptIter.UnsafeKey(); pointKey.IsValue() { + return errors.Errorf("attempt to delete range tombstone %q hiding key at %q", + gcKey, pointKey) + } + } + ptIter.Close() + ptIter = nil + } + + iter.Close() + iter = nil + } + + return nil +} + +// updateStatsOnRangeTombstoneGC updates stats for removed range keys. +// If all range keys are removed for the interval then this method should be +// called once with removeKeys == true. This would ensure we account for keys +// being removed. Then it should be called without removeKeys for all subsequent +// range keys in stack to account for remaining timestamps and values. +func updateStatsOnRangeTombstoneGC( + removedRange MVCCRangeKeyValue, removeKeys bool, +) (ms enginepb.MVCCStats) { + ms.AgeTo(removedRange.RangeKey.Timestamp.WallTime) + + if removeKeys { + leftKeySize := int64(EncodedMVCCKeyPrefixLength(removedRange.RangeKey.StartKey)) + rightKeySize := int64(EncodedMVCCKeyPrefixLength(removedRange.RangeKey.EndKey)) + ms.RangeKeyBytes -= leftKeySize + rightKeySize + ms.RangeKeyCount-- + } + tsSize := int64(EncodedMVCCTimestampSuffixLength(removedRange.RangeKey.Timestamp)) + valueSize := int64(len(removedRange.Value)) + ms.RangeKeyBytes -= tsSize + ms.RangeValBytes -= valueSize + ms.RangeValCount-- + return ms +} + +// updateStatsOnRangeTombstoneSplit updates stats for ranges that are being +// split during GC operation. +// If splitKeys is true, this method will account for the key sizes added by +// split. If false then only timestamp and value duplication is added. +// When splitting range, this method should be called once with splitKeys +// when processing top range key and then with false for remaining range keys. +func updateStatsOnRangeTombstoneSplit( + rangeKey MVCCRangeKeyValue, splitKey roachpb.Key, splitKeys bool, +) (ms enginepb.MVCCStats) { + ms.AgeTo(rangeKey.RangeKey.Timestamp.WallTime) + + // Mind that we add key contributions twice since they are added as the + // end key and start key for ranges. If trimmed range is removed at current + // timestamp extra value will be removed above when handling the deletion. + if splitKeys { + keySize := int64(EncodedMVCCKeyPrefixLength(splitKey)) + ms.RangeKeyBytes += keySize * 2 + ms.RangeKeyCount++ + } + + tsSize := int64(EncodedMVCCTimestampSuffixLength(rangeKey.RangeKey.Timestamp)) + valueSize := int64(len(rangeKey.Value)) + ms.RangeKeyBytes += tsSize + ms.RangeValBytes += valueSize + ms.RangeValCount++ + return ms +} + +// updateStatsOnRangeTombstoneMerge updates MVCCStats for the case where all +// range tombstone fragments merge to the left. i.e. start key is eliminated +// twice at the top of history and all versions are removed as a timestamp and +// a value. +func updateStatsOnRangeTombstoneMerge(rangeKeys []MVCCRangeKeyValue) (ms enginepb.MVCCStats) { + ms.AgeTo(rangeKeys[0].RangeKey.Timestamp.WallTime) + ms.RangeKeyBytes -= int64(EncodedMVCCKeyPrefixLength(rangeKeys[0].RangeKey.StartKey)) * 2 + ms.RangeKeyCount-- + for _, rk := range rangeKeys { + ms.AgeTo(rk.RangeKey.Timestamp.WallTime) + ms.RangeKeyBytes -= int64(EncodedMVCCTimestampSuffixLength(rk.RangeKey.Timestamp)) + ms.RangeValCount-- + ms.RangeValBytes -= int64(len(rk.Value)) + } + return ms +} + // MVCCFindSplitKey finds a key from the given span such that the left side of // the split is roughly targetSize bytes. The returned key will never be chosen // from the key ranges listed in keys.NoSplitSpans. diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 6c958bb1f7fd..59900718a7f7 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -4834,11 +4834,11 @@ func TestMVCCGarbageCollect(t *testing.T) { t.Fatal(err) } if log.V(1) { + log.Info(context.Background(), "Engine content before GC") kvsn, err := Scan(engine, localMax, keyMax, 0) if err != nil { t.Fatal(err) } - log.Info(context.Background(), "before") for i, kv := range kvsn { log.Infof(context.Background(), "%d: %s", i, kv.Key) } @@ -4879,11 +4879,11 @@ func TestMVCCGarbageCollect(t *testing.T) { } if log.V(1) { + log.Info(context.Background(), "Engine content after GC") kvsn, err := Scan(engine, localMax, keyMax, 0) if err != nil { t.Fatal(err) } - log.Info(context.Background(), "after") for i, kv := range kvsn { log.Infof(context.Background(), "%d: %s", i, kv.Key) } @@ -5236,6 +5236,692 @@ func TestMVCCGarbageCollectUsesSeekLTAppropriately(t *testing.T) { } } +type rangeTestDataItem struct { + point MVCCKeyValue + txn *roachpb.Transaction + rangeTombstone MVCCRangeKey +} + +type rangeTestData []rangeTestDataItem + +func (d rangeTestData) populateEngine( + t *testing.T, engine ReadWriter, ms *enginepb.MVCCStats, +) hlc.Timestamp { + ctx := context.Background() + var ts hlc.Timestamp + for _, v := range d { + if v.rangeTombstone.Timestamp.IsEmpty() { + if v.point.Value != nil { + require.NoError(t, MVCCPut(ctx, engine, ms, v.point.Key.Key, v.point.Key.Timestamp, + hlc.ClockTimestamp{}, roachpb.MakeValueFromBytes(v.point.Value), v.txn), + "failed to insert test value into engine (%s)", v.point.Key.String()) + } else { + require.NoError(t, MVCCDelete(ctx, engine, ms, v.point.Key.Key, v.point.Key.Timestamp, + hlc.ClockTimestamp{}, v.txn), + "failed to insert tombstone value into engine (%s)", v.point.Key.String()) + } + ts = v.point.Key.Timestamp + } else { + require.NoError(t, MVCCDeleteRangeUsingTombstone(ctx, engine, ms, v.rangeTombstone.StartKey, + v.rangeTombstone.EndKey, v.rangeTombstone.Timestamp, hlc.ClockTimestamp{}, nil, nil, 0), + "failed to insert range tombstone into engine (%s)", v.rangeTombstone.String()) + ts = v.rangeTombstone.Timestamp + } + } + return ts +} + +// pt creates a point update for key with default value. +func pt(key roachpb.Key, ts hlc.Timestamp) rangeTestDataItem { + val := roachpb.MakeValueFromString("testval").RawBytes + return rangeTestDataItem{point: MVCCKeyValue{Key: mvccVersionKey(key, ts), Value: val}} +} + +// txn wraps point update and adds transaction to it for intent creation. +func txn(d rangeTestDataItem) rangeTestDataItem { + ts := d.point.Key.Timestamp + d.txn = &roachpb.Transaction{ + Status: roachpb.PENDING, + ReadTimestamp: ts, + GlobalUncertaintyLimit: ts.Next().Next(), + } + d.txn.ID = uuid.MakeV4() + d.txn.WriteTimestamp = ts + d.txn.Key = roachpb.Key([]byte{0, 1}) + return d +} + +// rng creates range tombstone update. +func rng(start, end roachpb.Key, ts hlc.Timestamp) rangeTestDataItem { + return rangeTestDataItem{rangeTombstone: MVCCRangeKey{StartKey: start, EndKey: end, Timestamp: ts}} +} + +func TestMVCCGarbageCollectRanges(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + + mkKey := func(k string) roachpb.Key { + return append(keys.SystemSQLCodec.TablePrefix(42), k...) + } + rangeStart := mkKey("") + rangeEnd := rangeStart.PrefixEnd() + + // Note we use keys of different lengths so that stats accounting errors + // would not obviously cancel out if right and left bounds are used + // incorrectly. + keyA := mkKey("a") + keyB := mkKey("bb") + keyC := mkKey("ccc") + keyD := mkKey("dddd") + keyE := mkKey("eeeee") + keyF := mkKey("ffffff") + + mkTs := func(wallTimeSec int64) hlc.Timestamp { + return hlc.Timestamp{WallTime: time.Second.Nanoseconds() * wallTimeSec} + } + + ts1 := mkTs(1) + ts2 := mkTs(2) + ts3 := mkTs(3) + ts4 := mkTs(4) + tsMax := mkTs(9) + + testData := []struct { + name string + // Note that range test data should be in ascending order (valid writes). + before rangeTestData + request []roachpb.GCRequest_GCRangeKey + // Note that expectations should be in timestamp descending order + // (forward iteration). + after []MVCCRangeKey + // Optional start and end range for tests that want to restrict default + // key range. + rangeStart roachpb.Key + rangeEnd roachpb.Key + }{ + { + name: "signle range", + before: rangeTestData{ + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts3}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "multiple contiguous fragments", + before: rangeTestData{ + rng(keyA, keyD, ts2), + rng(keyB, keyC, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts4}, + }, + }, + { + name: "multiple non-contiguous fragments", + before: rangeTestData{ + rng(keyA, keyB, ts2), + rng(keyC, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "multiple non-overlapping fragments", + before: rangeTestData{ + rng(keyA, keyB, ts2), + rng(keyC, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + }, + { + name: "overlapping [A--[B--B]--A]", + before: rangeTestData{ + rng(keyB, keyC, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [A--[B--A]--B]", + before: rangeTestData{ + rng(keyB, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyC, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + }, + { + name: "overlapping [B--[A--B]--A]", + before: rangeTestData{ + rng(keyA, keyC, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + }, + }, + { + name: "overlapping [B--[A--A]--B]", + before: rangeTestData{ + rng(keyA, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + }, + { + name: "overlapping [[AB--A]--B]", + before: rangeTestData{ + rng(keyA, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + }, + { + name: "overlapping [[AB--B]--A]", + before: rangeTestData{ + rng(keyA, keyB, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [B--[A--AB]]", + before: rangeTestData{ + rng(keyA, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + }, + }, + { + name: "overlapping [A--[B--AB]]", + before: rangeTestData{ + rng(keyB, keyD, ts2), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [B--[A--AB]] point before", + before: rangeTestData{ + rng(keyB, keyD, ts2), + pt(keyA, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts2}, + }, + }, + { + name: "overlapping [B--[A--AB]] point at range start", + before: rangeTestData{ + rng(keyA, keyD, ts2), + pt(keyA, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + }, + }, + { + name: "overlapping [B--[A--AB]] point between", + before: rangeTestData{ + rng(keyA, keyD, ts2), + pt(keyB, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyC, Timestamp: ts2}, + }, + }, + { + name: "overlapping [B--[A--AB]] point at gc start", + before: rangeTestData{ + rng(keyA, keyD, ts2), + pt(keyB, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts2}, + }, + }, + { + name: "overlapping [A--[B--AB]] point before", + before: rangeTestData{ + rng(keyC, keyD, ts2), + pt(keyA, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [A--[B--AB]] point at gc start", + before: rangeTestData{ + rng(keyB, keyD, ts2), + pt(keyA, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [A--[B--AB]] point between", + before: rangeTestData{ + rng(keyC, keyD, ts2), + pt(keyB, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "overlapping [A--[B--AB]] point at range start", + before: rangeTestData{ + rng(keyB, keyD, ts2), + pt(keyB, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "range under intent", + before: rangeTestData{ + rng(keyA, keyD, ts2), + txn(pt(keyA, ts4)), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "stacked range fragments", + before: rangeTestData{ + rng(keyB, keyC, ts2), + rng(keyA, keyD, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts4}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "old value before range", + before: rangeTestData{ + pt(keyA, ts2), + rng(keyB, keyC, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts3}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "old value at range end", + before: rangeTestData{ + pt(keyC, ts2), + rng(keyB, keyC, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts3}, + }, + after: []MVCCRangeKey{}, + }, + { + name: "range partially overlap gc request", + before: rangeTestData{ + rng(keyA, keyD, ts1), + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts1}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts3}, + {StartKey: keyA, EndKey: keyB, Timestamp: ts1}, + {StartKey: keyB, EndKey: keyC, Timestamp: ts3}, + {StartKey: keyC, EndKey: keyD, Timestamp: ts3}, + {StartKey: keyC, EndKey: keyD, Timestamp: ts1}, + }, + }, + { + name: "range merges sides", + before: rangeTestData{ + rng(keyB, keyC, ts1), + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts1}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts3}, + }, + }, + { + name: "range merges next", + before: rangeTestData{ + rng(keyB, keyC, ts1), + rng(keyA, keyC, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts1}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyC, Timestamp: ts3}, + }, + }, + { + name: "range merges previous", + before: rangeTestData{ + rng(keyA, keyB, ts1), + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyB, Timestamp: ts1}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts3}, + }, + }, + { + name: "range merges chain", + before: rangeTestData{ + rng(keyB, keyC, ts1), + rng(keyD, keyE, ts2), + rng(keyA, keyF, ts3), + rng(keyA, keyF, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts1}, + {StartKey: keyD, EndKey: keyE, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyF, Timestamp: ts4}, + {StartKey: keyA, EndKey: keyF, Timestamp: ts3}, + }, + }, + { + name: "range merges sequential", + before: rangeTestData{ + rng(keyC, keyD, ts1), + rng(keyB, keyD, ts2), + rng(keyA, keyE, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts2}, + {StartKey: keyC, EndKey: keyD, Timestamp: ts2}, + }, + after: []MVCCRangeKey{ + {StartKey: keyA, EndKey: keyE, Timestamp: ts3}, + }, + }, + { + name: "don't merge outside range", + before: rangeTestData{ + rng(keyB, keyC, ts1), + // Tombstone spanning multiple ranges. + rng(keyA, keyD, ts4), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyB, EndKey: keyC, Timestamp: ts1}, + }, + after: []MVCCRangeKey{ + // We only iterate data within range, so range keys would be + // truncated. + {StartKey: keyB, EndKey: keyC, Timestamp: ts4}, + }, + rangeStart: keyB, + rangeEnd: keyC, + }, + } + + for _, engineImpl := range mvccEngineImpls { + t.Run(engineImpl.name, func(t *testing.T) { + for _, d := range testData { + t.Run(d.name, func(t *testing.T) { + engine := engineImpl.create() + defer engine.Close() + + // Populate range descriptor defaults. + if len(d.rangeStart) == 0 { + d.rangeStart = rangeStart + } + if len(d.rangeEnd) == 0 { + d.rangeEnd = rangeEnd + } + + var ms enginepb.MVCCStats + d.before.populateEngine(t, engine, &ms) + + rangeKeys := rangesFromRequests(rangeStart, rangeEnd, d.request) + require.NoError(t, MVCCGarbageCollectRangeKeys(ctx, engine, &ms, rangeKeys), + "failed to run mvcc range tombstone garbage collect") + + it := engine.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: d.rangeStart, + UpperBound: d.rangeEnd, + }) + defer it.Close() + it.SeekGE(MVCCKey{Key: d.rangeStart}) + expectIndex := 0 + for ; ; it.Next() { + ok, err := it.Valid() + require.NoError(t, err, "failed to iterate engine") + if !ok { + break + } + for _, rkv := range it.RangeKeys() { + require.Less(t, expectIndex, len(d.after), "not enough expectations; at unexpected range:", rkv.RangeKey.String()) + require.EqualValues(t, d.after[expectIndex], rkv.RangeKey, "range key is not equal") + expectIndex++ + } + } + require.Equal(t, len(d.after), expectIndex, + "not all range tombstone expectations were consumed") + + ms.AgeTo(tsMax.WallTime) + it = engine.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: d.rangeStart, + UpperBound: d.rangeEnd, + }) + expMs, err := ComputeStatsForRange(it, rangeStart, rangeEnd, tsMax.WallTime) + require.NoError(t, err, "failed to compute stats for range") + require.EqualValues(t, expMs, ms, "computed range stats vs gc'd") + }) + } + }) + } +} + +func rangesFromRequests( + rangeStart, rangeEnd roachpb.Key, rangeKeys []roachpb.GCRequest_GCRangeKey, +) []CollectableGCRangeKey { + collectableKeys := make([]CollectableGCRangeKey, len(rangeKeys)) + for i, rk := range rangeKeys { + leftPeekBound := rk.StartKey.Prevish(roachpb.PrevishKeyLength) + if leftPeekBound.Compare(rangeStart) <= 0 { + leftPeekBound = rangeStart + } + rightPeekBound := rk.EndKey.Next() + if rightPeekBound.Compare(rangeEnd) >= 0 { + rightPeekBound = rangeEnd + } + collectableKeys[i] = CollectableGCRangeKey{ + MVCCRangeKey: MVCCRangeKey{ + StartKey: rk.StartKey, + EndKey: rk.EndKey, + Timestamp: rk.Timestamp, + }, + LatchSpan: roachpb.Span{Key: leftPeekBound, EndKey: rightPeekBound}, + } + } + return collectableKeys +} + +func TestMVCCGarbageCollectRangesFailures(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + mkKey := func(k string) roachpb.Key { + return append(keys.SystemSQLCodec.TablePrefix(42), k...) + } + rangeStart := mkKey("") + rangeEnd := rangeStart.PrefixEnd() + + keyA := mkKey("a") + keyB := mkKey("b") + keyC := mkKey("c") + keyD := mkKey("d") + + mkTs := func(wallTimeSec int64) hlc.Timestamp { + return hlc.Timestamp{WallTime: time.Second.Nanoseconds() * wallTimeSec} + } + + ts1 := mkTs(1) + ts2 := mkTs(2) + ts3 := mkTs(3) + ts4 := mkTs(4) + ts5 := mkTs(5) + ts6 := mkTs(6) + ts7 := mkTs(7) + ts8 := mkTs(8) + + testData := []struct { + name string + before rangeTestData + request []roachpb.GCRequest_GCRangeKey + error string + }{ + { + name: "request overlap", + before: rangeTestData{ + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyC, Timestamp: ts3}, + {StartKey: keyB, EndKey: keyD, Timestamp: ts3}, + }, + error: "range keys in gc request should be non-overlapping", + }, + { + name: "delete range above value", + before: rangeTestData{ + pt(keyB, ts2), + rng(keyA, keyD, ts3), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts3}, + }, + error: "attempt to delete range tombstone .* hiding key at .*", + }, + { + // Note that this test is a bit contrived as we can't put intent + // under the range tombstone, but we test that if you try to delete + // tombstone above intents even if it doesn't exist, we would reject + // the attempt as it is an indication of inconsistency. + // This might be relaxed to ignore any points which are not covered. + name: "delete range above intent", + before: rangeTestData{ + rng(keyA, keyD, ts2), + txn(pt(keyB, ts3)), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts4}, + }, + error: "attempt to delete range tombstone .* hiding key at .*", + }, + { + name: "delete range above tail of long history", + before: rangeTestData{ + pt(keyB, ts1), + rng(keyA, keyD, ts2), + pt(keyB, ts3), + pt(keyB, ts4), + pt(keyB, ts5), + pt(keyB, ts6), + pt(keyB, ts7), + pt(keyB, ts8), + }, + request: []roachpb.GCRequest_GCRangeKey{ + {StartKey: keyA, EndKey: keyD, Timestamp: ts2}, + }, + error: "attempt to delete range tombstone .* hiding key at .*", + }, + } + + ctx := context.Background() + for _, engineImpl := range mvccEngineImpls { + t.Run(engineImpl.name, func(t *testing.T) { + for _, d := range testData { + t.Run(d.name, func(t *testing.T) { + engine := engineImpl.create() + defer engine.Close() + d.before.populateEngine(t, engine, nil) + rangeKeys := rangesFromRequests(rangeStart, rangeEnd, d.request) + err := MVCCGarbageCollectRangeKeys(ctx, engine, nil, rangeKeys) + require.Errorf(t, err, "expected error '%s' but found none", d.error) + require.True(t, testutils.IsError(err, d.error), + "expected error '%s' found '%s'", d.error, err) + }) + } + }) + } +} + // TestResolveIntentWithLowerEpoch verifies that trying to resolve // an intent at an epoch that is lower than the epoch of the intent // leaves the intent untouched. diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index 30b8e9278272..2698dbd2abcf 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -853,6 +853,10 @@ var charts = []sectionDescription{ Title: "Keys with GC'able Data", Metrics: []string{"queue.gc.info.numkeysaffected"}, }, + { + Title: "Range Keys with GC'able Data", + Metrics: []string{"queue.gc.info.numrangekeysaffected"}, + }, { Title: "Old Intents", Metrics: []string{"queue.gc.info.intentsconsidered"},