From 857fa5c92b7c10a480d46b742cbb5bc7fcc7d540 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 6 Feb 2022 22:22:00 +0000 Subject: [PATCH] storage: add experimental MVCC range tombstone primitives This patch adds initial experimental primitives for MVCC range tombstones and the range keys they build on, based on experimental Pebble range keys, * Data structures: * `storage.MVCCRangeKey` * `nil` value for range tombstones (as for point tombstones). * Engine methods for writing, removing, and reading range keys: * `Engine.ExperimentalClearMVCCRangeKey()` * `Engine.ExperimentalPutMVCCRangeKey()` * `SimpleMVCCIterator.HasPointAndRange()` * `SimpleMVCCIterator.RangeBounds()` * `SimpleMVCCIterator.RangeKeys()` * MVCC functions and iterator for writing and reading range tombstones: * `storage.ExperimentalMVCCDeleteRangeUsingTombstone()` * `storage.ScanMVCCTombstones()` * `storage.MVCCRangeTombstoneIterator` Range tombstones do not have a distinct identity, and should instead be considered a tombstone continuum: they will merge with abutting tombstones, can be partially cleared, can split or merge along with ranges, and so on. Bounded scans will truncate them to the scan bounds. Raw fragmented range keys can be accessed via the `Engine` and `SimpleMVCCIterator` interfaces, but this is primarily for internal MVCC usage. In the externally facing MVCC API, range tombstones will be exposed primarily in defragmented form. Range tombstones are not yet handled in the rest of the MVCC API, nor are they exposed via KV APIs. They are not persisted to disk either, due to Pebble range key limitations. Subsequent pull requests will extend their functionality and integrate them with other components. Release note: None --- pkg/kv/kvserver/rangefeed/task_test.go | 15 + pkg/kv/kvserver/spanset/batch.go | 31 ++ pkg/storage/BUILD.bazel | 2 + pkg/storage/engine.go | 46 +++ pkg/storage/intent_interleaving_iter.go | 15 + pkg/storage/multi_iterator.go | 16 + pkg/storage/mvcc.go | 60 ++++ pkg/storage/mvcc_history_test.go | 95 ++++-- pkg/storage/mvcc_incremental_iterator.go | 15 + pkg/storage/mvcc_key.go | 70 +++++ pkg/storage/mvcc_key_test.go | 91 ++++++ pkg/storage/mvcc_range_tombstone_iterator.go | 289 ++++++++++++++++++ .../mvcc_range_tombstone_iterator_test.go | 170 +++++++++++ pkg/storage/pebble.go | 28 ++ pkg/storage/pebble_batch.go | 18 ++ pkg/storage/pebble_iterator.go | 35 +++ pkg/storage/sst_iterator.go | 15 + pkg/storage/sst_writer.go | 18 ++ .../mvcc_histories/delete_range_tombstone | 111 +++++++ pkg/util/hlc/timestamp.go | 16 + 20 files changed, 1138 insertions(+), 18 deletions(-) create mode 100644 pkg/storage/mvcc_range_tombstone_iterator.go create mode 100644 pkg/storage/mvcc_range_tombstone_iterator_test.go create mode 100644 pkg/storage/testdata/mvcc_histories/delete_range_tombstone diff --git a/pkg/kv/kvserver/rangefeed/task_test.go b/pkg/kv/kvserver/rangefeed/task_test.go index 87b695a7a228..edf833d47904 100644 --- a/pkg/kv/kvserver/rangefeed/task_test.go +++ b/pkg/kv/kvserver/rangefeed/task_test.go @@ -190,6 +190,21 @@ func (s *testIterator) curKV() storage.MVCCKeyValue { return s.kvs[s.cur] } +// HasPointAndRange implements SimpleMVCCIterator. +func (s *testIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (s *testIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeTombstones implements SimpleMVCCIterator. +func (s *testIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + func TestInitResolvedTSScan(t *testing.T) { defer leaktest.AfterTest(t)() startKey := roachpb.RKey("d") diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index ae012cb87a87..752f465ee0ef 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -176,6 +176,21 @@ func (i *MVCCIterator) UnsafeValue() []byte { return i.i.UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *MVCCIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + // ComputeStats is part of the storage.MVCCIterator interface. func (i *MVCCIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, @@ -599,6 +614,22 @@ func (s spanSetWriter) ClearIterRange(iter storage.MVCCIterator, start, end roac return s.w.ClearIterRange(iter, start, end) } +func (s spanSetWriter) ExperimentalPutMVCCRangeKey( + rangeKey storage.MVCCRangeKey, value []byte, +) error { + if err := s.checkAllowedRange(rangeKey.StartKey, rangeKey.EndKey); err != nil { + return err + } + return s.w.ExperimentalPutMVCCRangeKey(rangeKey, value) +} + +func (s spanSetWriter) ExperimentalClearMVCCRangeKey(rangeKey storage.MVCCRangeKey) error { + if err := s.checkAllowedRange(rangeKey.StartKey, rangeKey.EndKey); err != nil { + return err + } + return s.w.ExperimentalClearMVCCRangeKey(rangeKey) +} + func (s spanSetWriter) Merge(key storage.MVCCKey, value []byte) error { if s.spansOnly { if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil { diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index 5952a8b79c08..6c3f25181377 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -21,6 +21,7 @@ go_library( "mvcc_incremental_iterator.go", "mvcc_key.go", "mvcc_logical_ops.go", + "mvcc_range_tombstone_iterator.go", "open.go", "pebble.go", "pebble_batch.go", @@ -108,6 +109,7 @@ go_test( "mvcc_incremental_iterator_test.go", "mvcc_key_test.go", "mvcc_logical_ops_test.go", + "mvcc_range_tombstone_iterator_test.go", "mvcc_stats_test.go", "mvcc_test.go", "pebble_file_registry_test.go", diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 5198a553ab36..aec81dc66fb9 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -72,6 +72,14 @@ type SimpleMVCCIterator interface { // UnsafeValue returns the same value as Value, but the memory is // invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. UnsafeValue() []byte + // HasPointAndRange returns whether the current iterator position has a point + // key and/or a range key. + HasPointAndRange() (bool, bool) + // RangeBounds returns the range bounds for the current range key, if any. + RangeBounds() (roachpb.Key, roachpb.Key) + // RangeKeys returns the range key fragments at the current iterator + // posistion. + RangeKeys() []MVCCRangeKeyValue } // IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats. @@ -309,8 +317,23 @@ type IterOptions struct { // use such an iterator is to use it in concert with an iterator without // timestamp hints, as done by MVCCIncrementalIterator. MinTimestampHint, MaxTimestampHint hlc.Timestamp + // KeyTypes specifies the types of keys to surface: point and/or range keys. + // Use HasPointAndRange() to determine which key type is present at a given + // iterator position, and RangeBounds() and RangeKeys() to access range keys. + KeyTypes IterKeyType } +// IterKeyType configures which types of keys an iterator should surface. +// +// TODO(erikgrinaker): Combine this with MVCCIterKind somehow. +type IterKeyType = pebble.IterKeyType + +const ( + IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly + IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges + IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly +) + // MVCCIterKind is used to inform Reader about the kind of iteration desired // by the caller. type MVCCIterKind int @@ -584,6 +607,29 @@ type Writer interface { // returns. ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error + // ExperimentalClearMVCCRangeKey deletes an MVCC range key from start + // (inclusive) to end (exclusive) at the given timestamp. For any range key + // that straddles the start and end boundaries, only the segments within the + // boundaries will be cleared. Clears are idempotent. + // + // This method is primarily intented for MVCC garbage collection and similar + // internal use. It mutates MVCC history, and does not check for intents or + // other conflicts. + // + // This method is EXPERIMENTAL. Range keys are not supported throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error + + // ExperimentalPutMVCCRangeKey writes a value to an MVCC range key. It is + // primarily used for range tombstones, which have a value of nil. Range keys + // will be non-deterministically fragmented by Pebble, and may be partially + // removed. Range keys can be accessed via MVCCIterator with the appropriate + // settings. + // + // This function is EXPERIMENTAL. Range keys are not handled throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalPutMVCCRangeKey(MVCCRangeKey, []byte) error + // Merge is a high-performance write operation used for values which are // accumulated over several writes. Multiple values can be merged // sequentially into a single key; a subsequent read will return a "merged" diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index bf31221fbe93..e59407c3a43c 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -715,6 +715,21 @@ func (i *intentInterleavingIter) Value() []byte { return i.iter.Value() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *intentInterleavingIter) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + func (i *intentInterleavingIter) Close() { i.iter.Close() i.intentIter.Close() diff --git a/pkg/storage/multi_iterator.go b/pkg/storage/multi_iterator.go index 9838adf60ec7..6d2558e10c21 100644 --- a/pkg/storage/multi_iterator.go +++ b/pkg/storage/multi_iterator.go @@ -14,6 +14,7 @@ import ( "bytes" "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" ) const invalidIdxSentinel = -1 @@ -92,6 +93,21 @@ func (f *multiIterator) UnsafeValue() []byte { return f.iters[f.currentIdx].UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (f *multiIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (f *multiIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *multiIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // Next advances the iterator to the next key/value in the iteration. After this // call, Valid() will be true if the iterator was not positioned at the last // key. diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 64b0d1abaef2..7a46008ac6ea 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -99,6 +99,12 @@ type MVCCKeyValue struct { Value []byte } +// MVCCRangeKeyValue represents a ranged key/value pair. +type MVCCRangeKeyValue struct { + Key MVCCRangeKey + Value []byte +} + // optionalValue represents an optional roachpb.Value. It is preferred // over a *roachpb.Value to avoid the forced heap allocation. type optionalValue struct { @@ -2202,6 +2208,33 @@ func MVCCDeleteRange( return keys, res.ResumeSpan, res.NumKeys, nil } +// ExperimentalMVCCDeleteRangeUsingTombstone deletes the given MVCC keyspan at +// the given timestamp using a range tombstone (rather than point tombstones). +// This operation is non-transactional, but will check for existing intents and +// return a WriteIntentError containing up to maxIntents intents. +// +// This function is EXPERIMENTAL. Range tombstones are not supported throughout +// the MVCC API, and the on-disk format is unstable. +// +// TODO(erikgrinaker): Needs conflict handling, e.g. WriteTooOldError. +// TODO(erikgrinaker): Needs MVCCStats handling. +func ExperimentalMVCCDeleteRangeUsingTombstone( + ctx context.Context, + rw ReadWriter, + ms *enginepb.MVCCStats, + startKey, endKey roachpb.Key, + timestamp hlc.Timestamp, + maxIntents int64, +) error { + if intents, err := ScanIntents(ctx, rw, startKey, endKey, maxIntents, 0); err != nil { + return err + } else if len(intents) > 0 { + return &roachpb.WriteIntentError{Intents: intents} + } + return rw.ExperimentalPutMVCCRangeKey(MVCCRangeKey{ + StartKey: startKey, EndKey: endKey, Timestamp: timestamp}, nil) +} + func recordIteratorStats(traceSpan *tracing.Span, iteratorStats IteratorStats) { stats := iteratorStats.Stats if traceSpan != nil { @@ -3929,3 +3962,30 @@ func ComputeStatsForRange( ms.LastUpdateNanos = nowNanos return ms, nil } + +// MVCCScanRangeTombstones returns a list of range tombstones across the given +// span at the given timestamp, in end,timestamp order rather that +// start,timestamp. Any tombstones that straddle the bounds will be truncated. +func MVCCScanRangeTombstones( + ctx context.Context, reader Reader, start, end roachpb.Key, ts hlc.Timestamp, +) ([]MVCCRangeKey, error) { + var tombstones []MVCCRangeKey + iter := NewMVCCRangeTombstoneIterator(reader, MVCCRangeTombstoneIterOptions{ + LowerBound: start, + UpperBound: end, + MaxTimestamp: ts, + }) + for { + if ok, err := iter.Valid(); err != nil { + return nil, err + } else if !ok { + break + } + if err := ctx.Err(); err != nil { + return nil, err + } + tombstones = append(tombstones, iter.Key()) + iter.Next() + } + return tombstones, nil +} diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index f8fa5f827f6c..e4d6e15827d5 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -37,6 +37,7 @@ import ( "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" "github.com/cockroachdb/redact" + "github.com/stretchr/testify/require" ) // TestMVCCHistories verifies that sequences of MVCC reads and writes @@ -55,17 +56,20 @@ import ( // resolve_intent t= k= [status=] // check_intent k= [none] // -// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] -// del [t=] [ts=[,]] [resolve [status=]] k= -// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] -// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] -// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] +// del [t=] [ts=[,]] [resolve [status=]] k= +// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] +// del_range_ts [ts=[,]] k= end= +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] +// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] +// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// scan_range_ts k= end= [ts=[,]] // // merge [ts=[,]] k= v= [raw] // // clear_range k= end= +// clear_range_ts k= end= [ts=[,]] // // Where `` can be a simple string, or a string // prefixed by the following characters: @@ -112,8 +116,15 @@ func TestMVCCHistories(t *testing.T) { defer engine.Close() reportDataEntries := func(buf *redact.StringBuilder) error { - hasData := false - err := engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { + rangeTombstones, err := MVCCScanRangeTombstones( + ctx, engine, span.Key, span.EndKey, hlc.Timestamp{}) + require.NoError(t, err) + for _, rt := range rangeTombstones { + buf.Printf("range tombstone: %s\n", rt) + } + hasData := len(rangeTombstones) > 0 + + err = engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { hasData = true if r.Key.Timestamp.IsEmpty() { // Meta is at timestamp zero. @@ -396,15 +407,18 @@ var commands = map[string]cmd{ // TODO(nvanbenschoten): test "resolve_intent_range". "check_intent": {typReadOnly, cmdCheckIntent}, - "clear_range": {typDataUpdate, cmdClearRange}, - "cput": {typDataUpdate, cmdCPut}, - "del": {typDataUpdate, cmdDelete}, - "del_range": {typDataUpdate, cmdDeleteRange}, - "get": {typReadOnly, cmdGet}, - "increment": {typDataUpdate, cmdIncrement}, - "merge": {typDataUpdate, cmdMerge}, - "put": {typDataUpdate, cmdPut}, - "scan": {typReadOnly, cmdScan}, + "clear_range": {typDataUpdate, cmdClearRange}, + "clear_range_ts": {typDataUpdate, cmdClearRangeTombstone}, + "cput": {typDataUpdate, cmdCPut}, + "del": {typDataUpdate, cmdDelete}, + "del_range": {typDataUpdate, cmdDeleteRange}, + "del_range_ts": {typDataUpdate, cmdDeleteRangeTombstone}, + "get": {typReadOnly, cmdGet}, + "increment": {typDataUpdate, cmdIncrement}, + "merge": {typDataUpdate, cmdMerge}, + "put": {typDataUpdate, cmdPut}, + "scan": {typReadOnly, cmdScan}, + "scan_range_ts": {typReadOnly, cmdScanRangeTombstone}, } func cmdTxnAdvance(e *evalCtx) error { @@ -584,6 +598,16 @@ func cmdClearRange(e *evalCtx) error { return e.engine.ClearMVCCRangeAndIntents(key, endKey) } +func cmdClearRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + return e.engine.ExperimentalClearMVCCRangeKey(MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) +} + func cmdCPut(e *evalCtx) error { txn := e.getTxn(optional) ts := e.getTs(txn) @@ -660,6 +684,24 @@ func cmdDeleteRange(e *evalCtx) error { }) } +func cmdDeleteRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + return e.withWriter("del_range", func(rw ReadWriter) error { + err := ExperimentalMVCCDeleteRangeUsingTombstone(e.ctx, rw, nil, key, endKey, ts, 0) + if err != nil { + return err + } + e.results.buf.Printf("del_range_ts: %s\n", MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) + return nil + }) +} + func cmdGet(e *evalCtx) error { txn := e.getTxn(optional) key := e.getKey() @@ -824,6 +866,23 @@ func cmdScan(e *evalCtx) error { return err } +func cmdScanRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + tombstones, err := MVCCScanRangeTombstones(e.ctx, e.engine, key, endKey, ts) + if err != nil { + return err + } + for _, tombstone := range tombstones { + e.results.buf.Printf("scan_range_ts: %s\n", tombstone) + } + if len(tombstones) == 0 { + e.results.buf.Printf("scan_range_ts: %v-%v -> \n", key, endKey) + } + return nil +} + // evalCtx stored the current state of the environment of a running // script. type evalCtx struct { diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 0beb1b464ed5..a199c7329cd9 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -485,6 +485,21 @@ func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey { return i.iter.UnsafeKey() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // UnsafeValue returns the same value as Value, but the memory is invalidated on // the next call to {Next,Reset,Close}. func (i *MVCCIncrementalIterator) UnsafeValue() []byte { diff --git a/pkg/storage/mvcc_key.go b/pkg/storage/mvcc_key.go index 02cb4016ba92..caaa94a693e7 100644 --- a/pkg/storage/mvcc_key.go +++ b/pkg/storage/mvcc_key.go @@ -79,6 +79,11 @@ func (k MVCCKey) Equal(l MVCCKey) bool { return k.Key.Compare(l.Key) == 0 && k.Timestamp.EqOrdering(l.Timestamp) } +// IsEmpty returns true iff there is a non-zero key or timestamp. +func (k MVCCKey) IsEmpty() bool { + return k.Key == nil && k.Timestamp.IsEmpty() +} + // IsValue returns true iff the timestamp is non-zero. func (k MVCCKey) IsValue() bool { return !k.Timestamp.IsEmpty() @@ -287,3 +292,68 @@ func decodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) { } return decodeMVCCTimestamp(encodedTS[:encodedLen-1]) } + +// MVCCRangeKey is an MVCC key span at a timestamp. +type MVCCRangeKey struct { + StartKey roachpb.Key + EndKey roachpb.Key + Timestamp hlc.Timestamp +} + +// Clone returns a copy of the range key. +func (k MVCCRangeKey) Clone() MVCCRangeKey { + // k is already a copy, but needs key clones. + k.StartKey = k.StartKey.Clone() + k.EndKey = k.EndKey.Clone() + return k +} + +// Compare returns -1 if this key is less than the given key, 0 if they're +// equal, or 1 if the given key is greater than this. Comparison is by +// start,timestamp,end, where larger timestamps sort before smaller ones except +// empty ones which sort first (like elsewhere in MVCC). +func (k MVCCRangeKey) Compare(o MVCCRangeKey) int { + if c := k.StartKey.Compare(o.StartKey); c != 0 { + return c + } + if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() { + return -1 + } else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() { + return 1 + } else if k.Timestamp.Less(o.Timestamp) { + return 1 + } else if !k.Timestamp.EqOrdering(o.Timestamp) { + return -1 + } + return k.EndKey.Compare(o.EndKey) +} + +// String formats the range key. +func (k MVCCRangeKey) String() string { + s := roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}.String() + if !k.Timestamp.IsEmpty() { + s += fmt.Sprintf("/%s", k.Timestamp) + } + return s +} + +// Validate returns an error if the range key is invalid. +func (k MVCCRangeKey) Validate() (err error) { + return errors.Wrapf(k.validate(), "invalid range key %s", k) +} + +func (k MVCCRangeKey) validate() error { + if k.StartKey == nil { + return errors.Errorf("no start key") + } + if k.EndKey == nil { + return errors.Errorf("no end key") + } + if k.Timestamp.IsEmpty() { + return errors.Errorf("no timestamp") + } + if k.StartKey.Compare(k.EndKey) > 0 { + return errors.Errorf("start key %s is after end key %s", k.StartKey, k.EndKey) + } + return nil +} diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 63af9beda842..63ecfb7c5e14 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -238,3 +238,94 @@ func BenchmarkDecodeMVCCKey(b *testing.B) { } benchmarkDecodeMVCCKeyResult = mvccKey // avoid compiler optimizing away function call } + +func TestMVCCRangeKeyString(t *testing.T) { + defer leaktest.AfterTest(t)() + + testcases := map[string]struct { + rk MVCCRangeKey + expect string + }{ + "empty": {MVCCRangeKey{}, "/Min"}, + "only start": {MVCCRangeKey{StartKey: roachpb.Key("foo")}, "foo"}, + "only end": {MVCCRangeKey{EndKey: roachpb.Key("foo")}, "{/Min-foo}"}, + "only timestamp": {MVCCRangeKey{Timestamp: hlc.Timestamp{Logical: 1}}, "/Min/0,1"}, + "only span": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z")}, "{a-z}"}, + "all": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z"), Timestamp: hlc.Timestamp{Logical: 1}}, "{a-z}/0,1"}, + "all overlapping": {MVCCRangeKey{StartKey: roachpb.Key("ab"), EndKey: roachpb.Key("af"), Timestamp: hlc.Timestamp{Logical: 1}}, "a{b-f}/0,1"}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.rk.String()) + }) + } +} + +func TestMVCCRangeKeyCompare(t *testing.T) { + defer leaktest.AfterTest(t)() + + ab1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("b"), hlc.Timestamp{Logical: 1}} + ac1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + ac2 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 2}} + bc0 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 0}} + bc1 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + bc3 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 3}} + bd4 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("d"), hlc.Timestamp{Logical: 4}} + + testcases := map[string]struct { + a MVCCRangeKey + b MVCCRangeKey + expect int + }{ + "equal": {ac1, ac1, 0}, + "start lt": {ac1, bc1, -1}, + "start gt": {bc1, ac1, 1}, + "end lt": {ab1, ac1, -1}, + "end gt": {ac1, ab1, 1}, + "time lt": {ac2, ac1, -1}, // MVCC timestamps sort in reverse order + "time gt": {ac1, ac2, 1}, // MVCC timestamps sort in reverse order + "empty time lt set": {bc0, bc1, -1}, // empty MVCC timestamps sort before non-empty + "set time gt empty": {bc1, bc0, 1}, // empty MVCC timestamps sort before non-empty + "start time precedence": {ac2, bc3, -1}, + "time end precedence": {bd4, bc3, -1}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + }) + } +} + +func TestMVCCRangeKeyValidate(t *testing.T) { + defer leaktest.AfterTest(t)() + + a := roachpb.Key("a") + b := roachpb.Key("b") + blank := roachpb.Key("") + ts1 := hlc.Timestamp{Logical: 1} + + testcases := map[string]struct { + rangeKey MVCCRangeKey + expectErr string // empty if no error + }{ + "valid": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "start at end": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "blank keys": {MVCCRangeKey{StartKey: blank, EndKey: blank, Timestamp: ts1}, ""}, // equivalent to MinKey + "no start": {MVCCRangeKey{EndKey: b, Timestamp: ts1}, "{/Min-b}/0,1: no start key"}, + "no end": {MVCCRangeKey{StartKey: a, Timestamp: ts1}, "a/0,1: no end key"}, + "no timestamp": {MVCCRangeKey{StartKey: a, EndKey: b}, "{a-b}: no timestamp"}, + "empty": {MVCCRangeKey{}, "/Min: no start key"}, + "end before start": {MVCCRangeKey{StartKey: b, EndKey: a, Timestamp: ts1}, `{b-a}/0,1: start key "b" is after end key "a"`}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + err := tc.rangeKey.Validate() + if tc.expectErr == "" { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Contains(t, err.Error(), tc.expectErr) + } + }) + } +} diff --git a/pkg/storage/mvcc_range_tombstone_iterator.go b/pkg/storage/mvcc_range_tombstone_iterator.go new file mode 100644 index 000000000000..992bc5b5d862 --- /dev/null +++ b/pkg/storage/mvcc_range_tombstone_iterator.go @@ -0,0 +1,289 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "bytes" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/errors" +) + +// MVCCRangeTombstoneIterOptions are options for an MVCCRangeTombstoneIterator. +type MVCCRangeTombstoneIterOptions struct { + // LowerBound sets the inclusive lower bound of the iterator. Tombstones that + // straddle the it will have their start key truncated to the lower bound. + // + // NB: It may be tempting to use an MVCCKey here and include a timestamp, but + // this would be useless: giving e.g. a@4 would skip a tombstone starting at + // a@5, but the tombstone would logically exist at the adjacent a.Next()@5 so + // it would be emitted almost immediately anyway. + LowerBound roachpb.Key + // UpperBound sets the exclusive upper bound of the iterator. Tombstones that + // straddle it will have their end key truncated to the upper bound. + UpperBound roachpb.Key + // MinTimestamp sets the inclusive lower timestamp bound for the iterator. + MinTimestamp hlc.Timestamp + // MaxTimestamp sets the inclusive upper timestamp bound for the iterator. + MaxTimestamp hlc.Timestamp + // Fragmented will emit tombstone fragments as they are stored in Pebble. + // Fragments typically begin and end where a tombstone bound overlaps with + // another tombstone, for all overlapping tombstones. However, fragmentation + // is non-deterministic as it also depends on Pebble's internal SST structure + // and mutation history. + // + // When enabled, this results in an iteration order of StartKey,Timestamp as + // opposed to the normal EndKey,Timestamp order for range tombstones. This may + // be useful for partial results and resumption, e.g. resume spans. + Fragmented bool +} + +// MVCCRangeTombstoneIterator iterates over range tombstones in an engine, +// defragmenting them into contiguous range tombstones. It does not support +// seeking or backtracking, see RangeTombstoneIterOptions for lower/upper bounds +// and other options. +// +// Iteration uses EndKey,Timestamp order rather than StartKey,Timestamp. For +// example, [a-z)@3 will be emitted after [c-e)@2, but before [x-z)@1. This is a +// memory optimization when defragmenting Pebble range keys, to allow emitting +// tombstones as soon as possible. Otherwise, a single tombstone across the the +// entire key span would require all other tombstones at other timestamps to be +// buffered in memory before they could be emitted. However, see the Fragmented +// option which emits non-deterministic fragments in StartKey,Timestamp order. +type MVCCRangeTombstoneIterator struct { + iter MVCCIterator + opts MVCCRangeTombstoneIterOptions + incomplete []*MVCCRangeKey // defragmentation buffer + complete []MVCCRangeKey // queued for emission + completeIdx int // current Key() + iterDone bool // TODO(erikgrinaker): remove this + err error +} + +// NewMVCCRangeTombstoneIterator sets up a new MVCRangeTombstoneIterator and +// seeks to the first range tombstone. The caller must call Close() when done. +func NewMVCCRangeTombstoneIterator( + r Reader, opts MVCCRangeTombstoneIterOptions, +) *MVCCRangeTombstoneIterator { + iter := &MVCCRangeTombstoneIterator{ + iter: r.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: EncodeMVCCKey(MVCCKey{Key: opts.LowerBound}), + UpperBound: EncodeMVCCKey(MVCCKey{Key: opts.UpperBound}), + // TODO(erikgrinaker): We do not set Min/MaxTimestampHint here, because + // both are required and it's apparently not always safe to use. + }), + opts: opts, + incomplete: make([]*MVCCRangeKey, 0), + complete: make([]MVCCRangeKey, 0), + } + + // Seek the iterator to the lower bound and iterate until we've collected + // the first complete range tombstone (if any). + iter.iter.SeekGE(MVCCKey{Key: opts.LowerBound}) + iter.findCompleteTombstones() + + return iter +} + +// findCompleteTombstones processes range keys at the current iterator position +// and any subsequent iterator positions until it completes one or more +// tombstones, populating completeTombstones. Current completeTombstones are +// discarded. +func (p *MVCCRangeTombstoneIterator) findCompleteTombstones() { + p.complete = p.complete[:0] + p.completeIdx = 0 + p.updateTombstones() + + for len(p.complete) == 0 && !p.iterDone { + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if !ok { + break + } + p.iter.Next() + // NB: We update tombstones even if Next() invalidates the iterator, because + // there may be incomplete tombstones that become complete when the iterator + // is exhausted. + p.updateTombstones() + } +} + +// updateTombstones inspects the range keys at the current Pebble iterator +// position, tracks tombstones in incompleteTombstones, and moves any +// completed tombstones into completeTombstones. +func (p *MVCCRangeTombstoneIterator) updateTombstones() { + var startKey, endKey roachpb.Key + var rangeKeys []MVCCRangeKeyValue + + // If the iterator is exhausted, we still want to complete any remaining + // incomplete tombstones. + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if ok { + startKey, endKey = p.iter.RangeBounds() + rangeKeys = p.iter.RangeKeys() + + // TODO(erikgrinaker): Pebble does not yet truncate range keys to the + // LowerBound or UpperBound of the range, so we truncate them here. + if p.opts.LowerBound != nil && bytes.Compare(startKey, p.opts.LowerBound) < 0 { + startKey = p.opts.LowerBound + } + if p.opts.UpperBound != nil && bytes.Compare(endKey, p.opts.UpperBound) > 0 { + endKey = p.opts.UpperBound + } + } + + // TODO(erikgrinaker): Pebble does not yet respect UpperBound for range keys, + // and seems to go into an infinite loop if we try to exhaust the iterator + // here, so we use p.iterDone to mark it as done. + if len(p.opts.UpperBound) > 0 && bytes.Compare(startKey, p.opts.UpperBound) >= 0 { + p.iterDone = true + startKey, endKey, rangeKeys = nil, nil, nil + } + + // Both RangeKeys and incompleteTombstones are sorted in descending suffix + // order, so we iterate over them in lockstep and insert/update/delete + // incompleteTombstones as appropriate. + var tsIdx, rkIdx int + + for rkIdx < len(rangeKeys) { + rangeKey := rangeKeys[rkIdx] + + // Error on non-tombstone range keys. We expect all range keys to be range + // tombstones currently. + if rangeKey.Value != nil { + p.err = errors.Errorf("unexpected value for range key %s, expected nil: %x", + rangeKey.Key, rangeKey.Value) + return + } + + // Filter rangekeys by suffix. + // + // TODO(erikgrinaker): This can be optimized by skipping unnecessary + // comparisons since rangeKeys is sorted by suffix. Maybe later. + if !p.opts.MinTimestamp.IsEmpty() && rangeKey.Key.Timestamp.Less(p.opts.MinTimestamp) { + rkIdx++ + continue + } + if !p.opts.MaxTimestamp.IsEmpty() && p.opts.MaxTimestamp.Less(rangeKey.Key.Timestamp) { + rkIdx++ + continue + } + + // If we're at the end of incompleteTombstones, this range tombstone must be new. + if tsIdx >= len(p.incomplete) { + p.incomplete = append(p.incomplete, &MVCCRangeKey{ + StartKey: append(make([]byte, 0, len(startKey)), startKey...), + EndKey: append(make([]byte, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + }) + rkIdx++ + tsIdx++ + continue + } + + incomplete := p.incomplete[tsIdx] + cmp := incomplete.Timestamp.Compare(rangeKey.Key.Timestamp) + switch { + // If the timestamps match and the key spans are adjacent or overlapping, + // this range key extends the incomplete tombstone. + case cmp == 0 && bytes.Compare(startKey, incomplete.EndKey) <= 0: + incomplete.EndKey = append(incomplete.EndKey[:0], endKey...) + tsIdx++ + rkIdx++ + + // This is a different tombstone at the same suffix: complete the existing + // tombstone and start a new one. + case cmp == 0: + p.complete = append(p.complete, *incomplete) + // NB: can't reuse slices, as they were placed in the completed tombstone. + incomplete.StartKey = append(make([]byte, 0, len(startKey)), startKey...) + incomplete.EndKey = append(make([]byte, 0, len(endKey)), endKey...) + + // This incomplete tombstone is not present at this range key: complete it + // and remove it from the list, then try again. + case cmp == 1: + p.complete = append(p.complete, *incomplete) + p.incomplete = append(p.incomplete[:tsIdx], p.incomplete[tsIdx+1:]...) + + // This range key is a new incomplete tombstone: start tracking it. + case cmp == -1: + p.incomplete = append(p.incomplete[:tsIdx+1], p.incomplete[tsIdx:]...) + p.incomplete[tsIdx] = &MVCCRangeKey{ + StartKey: append(make(roachpb.Key, 0, len(startKey)), startKey...), + EndKey: append(make(roachpb.Key, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + } + tsIdx++ + rkIdx++ + + default: + p.err = errors.Errorf("unexpected comparison result %d", cmp) + return + } + } + + // If the caller has requested tombstone fragments, we complete all tombstones + // we found during this iteration by resetting tsIdx to 0. The loop below will + // handle the rest. + if p.opts.Fragmented { + tsIdx = 0 + } + + // If there are any remaining incomplete tombstones, they must be complete: + // make them so. + for _, ts := range p.incomplete[tsIdx:] { + p.complete = append(p.complete, *ts) + } + p.incomplete = p.incomplete[:tsIdx] +} + +// Close frees up resources held by the iterator. +func (p *MVCCRangeTombstoneIterator) Close() { + p.iter.Close() + p.complete = nil + p.completeIdx = 0 +} + +// Next iterates to the next range tombstone. Note the unusual iteration +// order, see struct comment for details. +func (p *MVCCRangeTombstoneIterator) Next() { + p.completeIdx++ + if p.completeIdx >= len(p.complete) { + p.iter.Next() + // NB: Called even if Next() fails, because we may have incomplete + // tombstones that become complete when the iterator is exhausted. + p.findCompleteTombstones() + } +} + +// Key returns the current range tombstone. It will not be invalidated by the +// iterator, but will be shared by all callers. +func (p *MVCCRangeTombstoneIterator) Key() MVCCRangeKey { + return p.complete[p.completeIdx] +} + +// Valid returns (true, nil) if the iterator points to a valid key, (false, nil) +// if the iterator is exhausted, or (false, error) if an error occurred during +// iteration. +func (p *MVCCRangeTombstoneIterator) Valid() (bool, error) { + if p.err != nil { + return false, p.err + } + if _, err := p.iter.Valid(); err != nil { + return false, err + } + return p.completeIdx < len(p.complete), nil +} diff --git a/pkg/storage/mvcc_range_tombstone_iterator_test.go b/pkg/storage/mvcc_range_tombstone_iterator_test.go new file mode 100644 index 000000000000..1723d424ef21 --- /dev/null +++ b/pkg/storage/mvcc_range_tombstone_iterator_test.go @@ -0,0 +1,170 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/stretchr/testify/require" +) + +func TestMVCCRangeTombstoneIterator(t *testing.T) { + defer leaktest.AfterTest(t)() + + ctx := context.Background() + eng := NewDefaultInMemForTesting() + defer eng.Close() + + rangeKeys := []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("e", "g", 3), + rangeKey("d", "f", 5), + rangeKey("d", "f", 2), + rangeKey("a", "m", 4), + rangeKey("m", "z", 4), + } + for _, rk := range rangeKeys { + require.NoError(t, ExperimentalMVCCDeleteRangeUsingTombstone( + ctx, eng, nil, rk.StartKey, rk.EndKey, rk.Timestamp, 0)) + } + + testcases := map[string]struct { + opts MVCCRangeTombstoneIterOptions + expect []MVCCRangeKey + }{ + "all tombstones": { + MVCCRangeTombstoneIterOptions{}, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 5), + rangeKey("d", "f", 2), + rangeKey("e", "g", 3), + rangeKey("a", "z", 4), + }}, + "truncated tombstones": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("e"), + }, + []MVCCRangeKey{ + rangeKey("d", "e", 5), + rangeKey("c", "e", 4), + rangeKey("d", "e", 2), + }}, + "truncation between tombstone bounds": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("ccc"), + UpperBound: roachpb.Key("eee"), + }, + []MVCCRangeKey{ + rangeKey("d", "eee", 5), + rangeKey("ccc", "eee", 4), + rangeKey("e", "eee", 3), + rangeKey("d", "eee", 2), + }}, + "fragmented tombstones": { + MVCCRangeTombstoneIterOptions{ + Fragmented: true, + }, + []MVCCRangeKey{ + rangeKey("a", "b", 4), + rangeKey("b", "c", 4), + rangeKey("b", "c", 3), + rangeKey("c", "d", 4), + rangeKey("d", "e", 5), + rangeKey("d", "e", 4), + rangeKey("d", "e", 2), + rangeKey("e", "f", 5), + rangeKey("e", "f", 4), + rangeKey("e", "f", 3), + rangeKey("e", "f", 2), + rangeKey("f", "g", 4), + rangeKey("f", "g", 3), + rangeKey("g", "m", 4), + rangeKey("m", "z", 4), + }}, + "empty interval": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("A"), + UpperBound: roachpb.Key("Z"), + }, + nil}, + "zero-length interval": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("c"), + }, + nil}, + "end after start": { + MVCCRangeTombstoneIterOptions{ + LowerBound: roachpb.Key("e"), + UpperBound: roachpb.Key("d"), + }, + nil}, + "min timestamp": { + MVCCRangeTombstoneIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 5), + rangeKey("e", "g", 3), + rangeKey("a", "z", 4), + }}, + "max timestamp": { + MVCCRangeTombstoneIterOptions{ + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("d", "f", 2), + rangeKey("e", "g", 3), + }}, + "both timestamps": { + MVCCRangeTombstoneIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKey{ + rangeKey("b", "c", 3), + rangeKey("e", "g", 3), + }}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + var tombstones []MVCCRangeKey + iter := NewMVCCRangeTombstoneIterator(eng, tc.opts) + defer iter.Close() + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + tombstones = append(tombstones, iter.Key()) + iter.Next() + } + require.Equal(t, tc.expect, tombstones) + }) + } +} + +func rangeKey(start, end string, ts int) MVCCRangeKey { + return MVCCRangeKey{ + StartKey: roachpb.Key(start), + EndKey: roachpb.Key(end), + Timestamp: hlc.Timestamp{Logical: int32(ts)}, + } +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 02e0bbdc8cbb..f28d1932f4bc 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -481,6 +481,8 @@ func DefaultPebbleOptions() *pebble.Options { TablePropertyCollectors: PebbleTablePropertyCollectors, BlockPropertyCollectors: PebbleBlockPropertyCollectors, } + // Used for experimental MVCC range tombstones. + opts.Experimental.RangeKeys = new(pebble.RangeKeysArena) // Automatically flush 10s after the first range tombstone is added to a // memtable. This ensures that we can reclaim space even when there's no // activity on the database generating flushes. @@ -1108,6 +1110,24 @@ func (p *Pebble) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error return batch.Commit(true) } +// ExperimentalDeleteMVCCRange implements the Engine interface. +func (p *Pebble) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value, pebble.Sync) +} + +// ExperimentalClearMVCCRangeTombstone implements the Engine interface. +func (p *Pebble) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), pebble.Sync) +} + // Merge implements the Engine interface. func (p *Pebble) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { @@ -1882,6 +1902,14 @@ func (p *pebbleReadOnly) ClearIterRange(iter MVCCIterator, start, end roachpb.Ke panic("not implemented") } +func (p *pebbleReadOnly) ExperimentalPutMVCCRangeKey(_ MVCCRangeKey, _ []byte) error { + panic("not implemented") +} + +func (p *pebbleReadOnly) ExperimentalClearMVCCRangeKey(_ MVCCRangeKey) error { + panic("not implemented") +} + func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error { panic("not implemented") } diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 117c2995cfa8..5de38132298e 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -413,6 +413,24 @@ func (p *pebbleBatch) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) return nil } +// ExperimentalDeleteMVCCRange implements the Batch interface. +func (p *pebbleBatch) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.batch.Experimental().RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value, nil) +} + +// ExperimentalClearMVCCRangeTombstone implements the Engine interface. +func (p *pebbleBatch) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), nil) +} + // Merge implements the Batch interface. func (p *pebbleBatch) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index cc4a5ebe502e..32d1fdd94c90 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -166,6 +166,8 @@ func (p *pebbleIterator) init(handle pebble.Reader, iterToClone cloneableIter, o panic("min timestamp hint set without max timestamp hint") } + p.options.KeyTypes = opts.KeyTypes + if doClone { var err error if p.iter, err = iterToClone.Clone(); err != nil { @@ -575,6 +577,39 @@ func (p *pebbleIterator) ValueProto(msg protoutil.Message) error { return protoutil.Unmarshal(value, msg) } +// HasPointAndRange implements the MVCCIterator interface. +func (p *pebbleIterator) HasPointAndRange() (bool, bool) { + return p.iter.HasPointAndRange() +} + +// RangeBounds implements the MVCCIterator interface. +func (p *pebbleIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + return p.iter.RangeBounds() +} + +// RangeKeys implements the MVCCIterator interface. +func (p *pebbleIterator) RangeKeys() []MVCCRangeKeyValue { + start, end := p.iter.RangeBounds() + rangeKeys := p.iter.RangeKeys() + rangeValues := make([]MVCCRangeKeyValue, 0, len(rangeKeys)) + + for _, rangeKey := range rangeKeys { + timestamp, err := decodeMVCCTimestampSuffix(rangeKey.Suffix) + if err != nil { + panic(err) // TODO(erikgrinaker): Don't panic. + } + rangeValues = append(rangeValues, MVCCRangeKeyValue{ + Key: MVCCRangeKey{ + StartKey: start, + EndKey: end, + Timestamp: timestamp, + }, + Value: rangeKey.Value, + }) + } + return rangeValues +} + // ComputeStats implements the MVCCIterator interface. func (p *pebbleIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, diff --git a/pkg/storage/sst_iterator.go b/pkg/storage/sst_iterator.go index 9bd8b49b1b39..890a8cbb1d70 100644 --- a/pkg/storage/sst_iterator.go +++ b/pkg/storage/sst_iterator.go @@ -158,3 +158,18 @@ func (r *sstIterator) UnsafeKey() MVCCKey { func (r *sstIterator) UnsafeValue() []byte { return r.value } + +// HasPointAndRange implements SimpleMVCCIterator. +func (r *sstIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (r *sstIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (s *sstIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 9b58c8afebc1..e4eb3ba55108 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -118,6 +118,24 @@ func (fw *SSTWriter) ClearMVCCRange(start, end MVCCKey) error { return fw.clearRange(start, end) } +// ExperimentalDeleteMVCCRange implements the Writer interface. +func (fw *SSTWriter) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return fw.fw.RangeKeySet(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp), value) +} + +// ExperimentalClearMVCCRangeTombstone implements the Writer interface. +func (fw *SSTWriter) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return fw.fw.RangeKeyUnset(rangeKey.StartKey, rangeKey.EndKey, + encodeMVCCTimestampSuffix(rangeKey.Timestamp)) +} + func (fw *SSTWriter) clearRange(start, end MVCCKey) error { if fw.fw == nil { return errors.New("cannot call ClearRange on a closed writer") diff --git a/pkg/storage/testdata/mvcc_histories/delete_range_tombstone b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone new file mode 100644 index 000000000000..df1121b79e21 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone @@ -0,0 +1,111 @@ +# TODO(erikgrinaker): The MVCC API does not respect range tombstones yet, so +# we don't test point keys because they remain unaffected. +# TODO(erikgrinaker): This needs conflict tests, implement later. + +# Write some range tombstones. Some will abut and merge. +run ok +del_range_ts k=b end=c ts=3 +del_range_ts k=e end=g ts=3 +del_range_ts k=d end=f ts=5 +del_range_ts k=d end=f ts=2 +del_range_ts k=m end=z ts=1 +del_range_ts k=a end=m ts=4 +del_range_ts k=m end=z ts=4 +---- +del_range_ts: {b-c}/3.000000000,0 +del_range_ts: {e-g}/3.000000000,0 +del_range_ts: {d-f}/5.000000000,0 +del_range_ts: {d-f}/2.000000000,0 +del_range_ts: {m-z}/1.000000000,0 +del_range_ts: {a-m}/4.000000000,0 +del_range_ts: {m-z}/4.000000000,0 +>> at end: +range tombstone: {b-c}/3.000000000,0 +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {e-g}/3.000000000,0 +range tombstone: {a-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Scan all tombstones. +run ok +scan_range_ts k=a end=z +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/5.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {a-z}/4.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Scan truncates tombstones to scan bounds. +run ok +scan_range_ts k=c end=e +---- +scan_range_ts: {d-e}/5.000000000,0 +scan_range_ts: {c-e}/4.000000000,0 +scan_range_ts: {d-e}/2.000000000,0 + +# Scan truncates tombstones to scan bounds when not on tombstone bounds. +run ok +scan_range_ts k=ccc end=eee +---- +scan_range_ts: {d-eee}/5.000000000,0 +scan_range_ts: {ccc-eee}/4.000000000,0 +scan_range_ts: e{-ee}/3.000000000,0 +scan_range_ts: {d-eee}/2.000000000,0 + +# Scan at a timestamp. +run ok +scan_range_ts k=a end=z ts=3 +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Empty scans. +run ok +scan_range_ts k=A end=Z +scan_range_ts k=c end=c +scan_range_ts k=z end=a +---- +scan_range_ts: "A"-"Z" -> +scan_range_ts: "c"-"c" -> +scan_range_ts: "z"-"a" -> + +# Remove some tombstones, both a non-existant one and a span across two +# tombstones. +run ok +clear_range_ts k=a end=z ts=10 +clear_range_ts k=b end=g ts=3 +---- +>> at end: +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {a-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Remove the middle section of the [a-z)@4 tombstone. Do it twice, to +# make sure clears are idempotent. +run ok +clear_range_ts k=k end=n ts=4 +clear_range_ts k=k end=n ts=4 +---- +>> at end: +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {a-k}/4.000000000,0 +range tombstone: {n-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 + +# Remove portions of the [a-k)@4 and [n-z)@4 tombstones in one operation. +run ok +clear_range_ts k=eee end=ttt ts=4 +---- +>> at end: +range tombstone: {a-eee}/4.000000000,0 +range tombstone: {d-f}/5.000000000,0 +range tombstone: {d-f}/2.000000000,0 +range tombstone: {ttt-z}/4.000000000,0 +range tombstone: {m-z}/1.000000000,0 diff --git a/pkg/util/hlc/timestamp.go b/pkg/util/hlc/timestamp.go index 0263ac1d3946..b84b39207fa6 100644 --- a/pkg/util/hlc/timestamp.go +++ b/pkg/util/hlc/timestamp.go @@ -54,6 +54,22 @@ func (t Timestamp) LessEq(s Timestamp) bool { return t.WallTime < s.WallTime || (t.WallTime == s.WallTime && t.Logical <= s.Logical) } +// Compare returns -1 if this timestamp is lesser than the given timestamp, 1 if +// it is greater, and 0 if they are equal. +func (t Timestamp) Compare(s Timestamp) int { + if t.WallTime > s.WallTime { + return 1 + } else if t.WallTime < s.WallTime { + return -1 + } else if t.Logical > s.Logical { + return 1 + } else if t.Logical < s.Logical { + return -1 + } else { + return 0 + } +} + // String implements the fmt.Stringer interface. func (t Timestamp) String() string { // The following code was originally written as