diff --git a/pkg/kv/kvserver/rangefeed/task_test.go b/pkg/kv/kvserver/rangefeed/task_test.go index 87b695a7a228..edf833d47904 100644 --- a/pkg/kv/kvserver/rangefeed/task_test.go +++ b/pkg/kv/kvserver/rangefeed/task_test.go @@ -190,6 +190,21 @@ func (s *testIterator) curKV() storage.MVCCKeyValue { return s.kvs[s.cur] } +// HasPointAndRange implements SimpleMVCCIterator. +func (s *testIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (s *testIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeTombstones implements SimpleMVCCIterator. +func (s *testIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + func TestInitResolvedTSScan(t *testing.T) { defer leaktest.AfterTest(t)() startKey := roachpb.RKey("d") diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index ae012cb87a87..b44c26d3dee4 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -176,6 +176,21 @@ func (i *MVCCIterator) UnsafeValue() []byte { return i.i.UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIterator) RangeKeys() []storage.MVCCRangeKeyValue { + panic("not implemented") +} + // ComputeStats is part of the storage.MVCCIterator interface. func (i *MVCCIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, @@ -599,6 +614,16 @@ func (s spanSetWriter) ClearIterRange(iter storage.MVCCIterator, start, end roac return s.w.ClearIterRange(iter, start, end) } +func (s spanSetWriter) ExperimentalPutMVCCRangeKey( + rangeKey storage.MVCCRangeKey, value []byte, +) error { + panic("not implemented") +} + +func (s spanSetWriter) ExperimentalClearMVCCRangeKey(rangeKey storage.MVCCRangeKey) error { + panic("not implemented") +} + func (s spanSetWriter) Merge(key storage.MVCCKey, value []byte) error { if s.spansOnly { if err := s.spans.CheckAllowed(SpanReadWrite, roachpb.Span{Key: key.Key}); err != nil { diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index ea1d295af0fc..73b6017c8ace 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -21,6 +21,7 @@ go_library( "mvcc_incremental_iterator.go", "mvcc_key.go", "mvcc_logical_ops.go", + "mvcc_range_key_iterator.go", "open.go", "pebble.go", "pebble_batch.go", @@ -108,6 +109,7 @@ go_test( "mvcc_incremental_iterator_test.go", "mvcc_key_test.go", "mvcc_logical_ops_test.go", + "mvcc_range_key_iterator_test.go", "mvcc_stats_test.go", "mvcc_test.go", "pebble_file_registry_test.go", diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 5198a553ab36..039ec06a222b 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -67,11 +67,56 @@ type SimpleMVCCIterator interface { // reverse iteration to forward iteration. NextKey() // UnsafeKey returns the same value as Key, but the memory is invalidated on - // the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. + // the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. If the iterator + // is on a range key only, this returns the start bound of the range key. UnsafeKey() MVCCKey // UnsafeValue returns the same value as Value, but the memory is // invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. UnsafeValue() []byte + // HasPointAndRange returns whether the current iterator position has a point + // key and/or a range key. If Valid() returns true, one of these will be true. + // Range keys are only emitted when requested via IterOptions.KeyTypes. + HasPointAndRange() (bool, bool) + // RangeBounds returns the range bounds for the current range key fragment, if + // any. See RangeKeys() for more info on range key fragments. + // + // NB: It is possible for this to return overlapping range bounds in some + // corner cases, e.g. first [b-c\0) and then [c-d). These overlapping bounds + // will always have the same values, and will always be at the very start of + // the second bounds, i.e. the overlap is [start, start.Next()). + // + // TODO(erikgrinaker): Try to guarantee non-overlapping bounds, but see + // comments in pebbleIterator.RangeBounds(). + RangeBounds() (roachpb.Key, roachpb.Key) + // RangeKeys returns all range key fragments (at different timestamps) at the + // current key position. If we are at a point key, it will return all range + // keys that overlap that point key at any timestamp. + // + // For defragmented iteration, use MVCCRangeKeyIterator instead. Fragmented + // iteration is primarily useful in two cases: + // + // - To iterate over point keys while accessing overlapping range keys + // (e.g. to determine if it is hidden by a range tombstone). + // + // - For partial iteration with later resumption, e.g. Export requests with + // byte limits that have to return point and range key data for a partial + // key span and then resume from that point in a later request. + // + // Range keys are fragmented by Pebble such that all overlapping range keys + // between two fragment bounds form a "stack" of range key fragments at + // different timestamps. Fragmentation is desirable at the storage layer to + // store range keys across SSTs and CRDB ranges without incurring + // cross-SST/range access costs. Stacking is desirable to easily see all range + // keys that overlap with a given point, and to translate range keys from the + // 2D MVCC keyspan to the 1D Pebble keyspan. + // + // This fragmentation is non-deterministic, as it also depends on Pebble's + // internal SST structure (which changes with compactions) and the store's + // write history. They will also split and merge along with CRDB ranges, can + // be partially removed by GC, and may be truncated by iterator bounds. + // + // TODO(erikgrinaker): Write a tech note on range keys and link it here. + RangeKeys() []MVCCRangeKeyValue } // IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats. @@ -309,8 +354,27 @@ type IterOptions struct { // use such an iterator is to use it in concert with an iterator without // timestamp hints, as done by MVCCIncrementalIterator. MinTimestampHint, MaxTimestampHint hlc.Timestamp + // KeyTypes specifies the types of keys to surface: point and/or range keys. + // Use HasPointAndRange() to determine which key type is present at a given + // iterator position, and RangeBounds() and RangeKeys() to access range keys. + // For more info on range keys, see RangeKeys(). + KeyTypes IterKeyType } +// IterKeyType configures which types of keys an iterator should surface. +// +// TODO(erikgrinaker): Combine this with MVCCIterKind somehow. +type IterKeyType = pebble.IterKeyType + +const ( + // IterKeyTypePointsOnly iterates over point keys only. + IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly + // IterKeyTypePointsAndRanges iterates over both point and range keys. + IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges + // IterKeyTypeRangesOnly iterates over only range keys. + IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly +) + // MVCCIterKind is used to inform Reader about the kind of iteration desired // by the caller. type MVCCIterKind int @@ -584,6 +648,36 @@ type Writer interface { // returns. ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error + // ExperimentalClearMVCCRangeKey deletes an MVCC range key from start + // (inclusive) to end (exclusive) at the given timestamp. For any range key + // that straddles the start and end boundaries, only the segments within the + // boundaries will be cleared. Clears are idempotent. + // + // This method is primarily intended for MVCC garbage collection and similar + // internal use. It mutates MVCC history, and does not check for intents or + // other conflicts. + // + // This method is EXPERIMENTAL. Range keys are not supported throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error + + // ExperimentalPutMVCCRangeKey writes a value to an MVCC range key. It is + // currently only used for range tombstones, which have a value of nil. Range + // keys exist separately from point keys in Pebble, and must be accessed via + // specialized iterator options and methods -- see e.g. IterOptions.KeyTypes, + // SimpleMVCCIterator.RangeKeys(), and MVCCRangeKeyIterator. + // + // A range key does not have a distinct identity, but should be considered a + // key continuum. They can be non-deterministically fragmented by Pebble, + // split/merged along with CRDB ranges, partially removed with + // ExperimentalClearMVCCRangeKey, and truncated during bounded iteration. + // + // TODO(erikgrinaker): Write a tech note on range keys and link it here. + // + // This function is EXPERIMENTAL. Range keys are not handled throughout the + // MVCC API, and the on-disk format is unstable. + ExperimentalPutMVCCRangeKey(MVCCRangeKey, []byte) error + // Merge is a high-performance write operation used for values which are // accumulated over several writes. Multiple values can be merged // sequentially into a single key; a subsequent read will return a "merged" diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index bf31221fbe93..834280186722 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -715,6 +715,21 @@ func (i *intentInterleavingIter) Value() []byte { return i.iter.Value() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *intentInterleavingIter) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *intentInterleavingIter) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + func (i *intentInterleavingIter) Close() { i.iter.Close() i.intentIter.Close() diff --git a/pkg/storage/multi_iterator.go b/pkg/storage/multi_iterator.go index 9838adf60ec7..6d2558e10c21 100644 --- a/pkg/storage/multi_iterator.go +++ b/pkg/storage/multi_iterator.go @@ -14,6 +14,7 @@ import ( "bytes" "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" ) const invalidIdxSentinel = -1 @@ -92,6 +93,21 @@ func (f *multiIterator) UnsafeValue() []byte { return f.iters[f.currentIdx].UnsafeValue() } +// HasPointAndRange implements SimpleMVCCIterator. +func (f *multiIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (f *multiIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (f *multiIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // Next advances the iterator to the next key/value in the iteration. After this // call, Valid() will be true if the iterator was not positioned at the last // key. diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 64b0d1abaef2..7f121c962bbc 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -99,6 +99,12 @@ type MVCCKeyValue struct { Value []byte } +// MVCCRangeKeyValue represents a ranged key/value pair. +type MVCCRangeKeyValue struct { + Key MVCCRangeKey + Value []byte +} + // optionalValue represents an optional roachpb.Value. It is preferred // over a *roachpb.Value to avoid the forced heap allocation. type optionalValue struct { @@ -2202,6 +2208,33 @@ func MVCCDeleteRange( return keys, res.ResumeSpan, res.NumKeys, nil } +// ExperimentalMVCCDeleteRangeUsingTombstone deletes the given MVCC keyspan at +// the given timestamp using a range tombstone (rather than point tombstones). +// This operation is non-transactional, but will check for existing intents and +// return a WriteIntentError containing up to maxIntents intents. +// +// This function is EXPERIMENTAL. Range tombstones are not supported throughout +// the MVCC API, and the on-disk format is unstable. +// +// TODO(erikgrinaker): Needs conflict handling, e.g. WriteTooOldError. +// TODO(erikgrinaker): Needs MVCCStats handling. +func ExperimentalMVCCDeleteRangeUsingTombstone( + ctx context.Context, + rw ReadWriter, + ms *enginepb.MVCCStats, + startKey, endKey roachpb.Key, + timestamp hlc.Timestamp, + maxIntents int64, +) error { + if intents, err := ScanIntents(ctx, rw, startKey, endKey, maxIntents, 0); err != nil { + return err + } else if len(intents) > 0 { + return &roachpb.WriteIntentError{Intents: intents} + } + return rw.ExperimentalPutMVCCRangeKey(MVCCRangeKey{ + StartKey: startKey, EndKey: endKey, Timestamp: timestamp}, nil) +} + func recordIteratorStats(traceSpan *tracing.Span, iteratorStats IteratorStats) { stats := iteratorStats.Stats if traceSpan != nil { @@ -3929,3 +3962,35 @@ func ComputeStatsForRange( ms.LastUpdateNanos = nowNanos return ms, nil } + +// MVCCScanRangeTombstones returns a list of range tombstones across the given +// span at the given timestamp, in end,timestamp order rather that +// start,timestamp. Any tombstones that straddle the bounds will be truncated. +func MVCCScanRangeTombstones( + ctx context.Context, reader Reader, start, end roachpb.Key, ts hlc.Timestamp, +) ([]MVCCRangeKey, error) { + var tombstones []MVCCRangeKey + iter := NewMVCCRangeKeyIterator(reader, MVCCRangeKeyIterOptions{ + LowerBound: start, + UpperBound: end, + MaxTimestamp: ts, + }) + for { + ok, err := iter.Valid() + if err != nil { + return nil, err + } + if !ok { + break + } + if err := ctx.Err(); err != nil { + return nil, err + } + // Skip non-tombstone range keys (we don't expect these to exist currently). + if len(iter.Value()) == 0 { + tombstones = append(tombstones, iter.Key()) + } + iter.Next() + } + return tombstones, nil +} diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index cd3f571bf4e0..761c06320bec 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -55,17 +55,20 @@ import ( // resolve_intent t= k= [status=] // check_intent k= [none] // -// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] -// del [t=] [ts=[,]] [resolve [status=]] k= -// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] -// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] -// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] +// del [t=] [ts=[,]] [resolve [status=]] k= +// del_range [t=] [ts=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] +// del_range_ts [ts=[,]] k= end= +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] +// increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] +// put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [avoidExcess] [allowEmpty] +// scan_range_keys k= end= [ts=[,]] // // merge [ts=[,]] k= v= [raw] // -// clear_range k= end= +// clear_range k= end= +// clear_range_key k= end= [ts=[,]] // // Where `` can be a simple string, or a string // prefixed by the following characters: @@ -112,8 +115,25 @@ func TestMVCCHistories(t *testing.T) { defer engine.Close() reportDataEntries := func(buf *redact.StringBuilder) error { - hasData := false - err := engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { + var hasData bool + + iter := NewMVCCRangeKeyIterator(engine, MVCCRangeKeyIterOptions{ + LowerBound: span.Key, + UpperBound: span.EndKey, + }) + defer iter.Close() + for { + if ok, err := iter.Valid(); err != nil { + return err + } else if !ok { + break + } + hasData = true + buf.Printf("range key: %s -> %+v\n", iter.Key(), iter.Value()) + iter.Next() + } + + err = engine.MVCCIterate(span.Key, span.EndKey, MVCCKeyAndIntentsIterKind, func(r MVCCKeyValue) error { hasData = true if r.Key.Timestamp.IsEmpty() { // Meta is at timestamp zero. @@ -396,15 +416,18 @@ var commands = map[string]cmd{ // TODO(nvanbenschoten): test "resolve_intent_range". "check_intent": {typReadOnly, cmdCheckIntent}, - "clear_range": {typDataUpdate, cmdClearRange}, - "cput": {typDataUpdate, cmdCPut}, - "del": {typDataUpdate, cmdDelete}, - "del_range": {typDataUpdate, cmdDeleteRange}, - "get": {typReadOnly, cmdGet}, - "increment": {typDataUpdate, cmdIncrement}, - "merge": {typDataUpdate, cmdMerge}, - "put": {typDataUpdate, cmdPut}, - "scan": {typReadOnly, cmdScan}, + "clear_range": {typDataUpdate, cmdClearRange}, + "clear_range_key": {typDataUpdate, cmdClearRangeKey}, + "cput": {typDataUpdate, cmdCPut}, + "del": {typDataUpdate, cmdDelete}, + "del_range": {typDataUpdate, cmdDeleteRange}, + "del_range_ts": {typDataUpdate, cmdDeleteRangeTombstone}, + "get": {typReadOnly, cmdGet}, + "increment": {typDataUpdate, cmdIncrement}, + "merge": {typDataUpdate, cmdMerge}, + "put": {typDataUpdate, cmdPut}, + "scan": {typReadOnly, cmdScan}, + "scan_range_ts": {typReadOnly, cmdScanRangeTombstone}, } func cmdTxnAdvance(e *evalCtx) error { @@ -584,6 +607,16 @@ func cmdClearRange(e *evalCtx) error { return e.engine.ClearMVCCRangeAndIntents(key, endKey) } +func cmdClearRangeKey(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + return e.engine.ExperimentalClearMVCCRangeKey(MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) +} + func cmdCPut(e *evalCtx) error { txn := e.getTxn(optional) ts := e.getTs(txn) @@ -660,6 +693,24 @@ func cmdDeleteRange(e *evalCtx) error { }) } +func cmdDeleteRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + return e.withWriter("del_range", func(rw ReadWriter) error { + err := ExperimentalMVCCDeleteRangeUsingTombstone(e.ctx, rw, nil, key, endKey, ts, 0) + if err != nil { + return err + } + e.results.buf.Printf("del_range_ts: %s\n", MVCCRangeKey{ + StartKey: key, + EndKey: endKey, + Timestamp: ts, + }) + return nil + }) +} + func cmdGet(e *evalCtx) error { txn := e.getTxn(optional) key := e.getKey() @@ -832,6 +883,23 @@ func cmdScan(e *evalCtx) error { return err } +func cmdScanRangeTombstone(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + + tombstones, err := MVCCScanRangeTombstones(e.ctx, e.engine, key, endKey, ts) + if err != nil { + return err + } + for _, tombstone := range tombstones { + e.results.buf.Printf("scan_range_ts: %s\n", tombstone) + } + if len(tombstones) == 0 { + e.results.buf.Printf("scan_range_ts: %v-%v -> \n", key, endKey) + } + return nil +} + // evalCtx stored the current state of the environment of a running // script. type evalCtx struct { diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 0beb1b464ed5..a199c7329cd9 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -485,6 +485,21 @@ func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey { return i.iter.UnsafeKey() } +// HasPointAndRange implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (i *MVCCIncrementalIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} + // UnsafeValue returns the same value as Value, but the memory is invalidated on // the next call to {Next,Reset,Close}. func (i *MVCCIncrementalIterator) UnsafeValue() []byte { diff --git a/pkg/storage/mvcc_key.go b/pkg/storage/mvcc_key.go index 02cb4016ba92..9c85f944b825 100644 --- a/pkg/storage/mvcc_key.go +++ b/pkg/storage/mvcc_key.go @@ -173,6 +173,12 @@ func encodeMVCCKeyToBuf(buf []byte, key MVCCKey, keyLen int) { } } +// encodeMVCCKeyPrefix encodes an MVCC user key (without timestamp) into its +// Pebble prefix representation. +func encodeMVCCKeyPrefix(key roachpb.Key) []byte { + return EncodeMVCCKey(MVCCKey{Key: key}) +} + // encodeMVCCTimestamp encodes an MVCC timestamp into its Pebble // representation, excluding length suffix and sentinel byte. func encodeMVCCTimestamp(ts hlc.Timestamp) []byte { @@ -287,3 +293,66 @@ func decodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) { } return decodeMVCCTimestamp(encodedTS[:encodedLen-1]) } + +// MVCCRangeKey is a versioned key span. +type MVCCRangeKey struct { + StartKey roachpb.Key + EndKey roachpb.Key + Timestamp hlc.Timestamp +} + +// Clone returns a copy of the range key. +func (k MVCCRangeKey) Clone() MVCCRangeKey { + // k is already a copy, but byte slices must be cloned. + k.StartKey = k.StartKey.Clone() + k.EndKey = k.EndKey.Clone() + return k +} + +// Compare returns -1 if this key is less than the given key, 0 if they're +// equal, or 1 if the given key is greater than this. Comparison is by +// start,timestamp,end, where larger timestamps sort before smaller ones except +// empty ones which sort first (like elsewhere in MVCC). +func (k MVCCRangeKey) Compare(o MVCCRangeKey) int { + if c := k.StartKey.Compare(o.StartKey); c != 0 { + return c + } + if k.Timestamp.IsEmpty() && !o.Timestamp.IsEmpty() { + return -1 + } else if !k.Timestamp.IsEmpty() && o.Timestamp.IsEmpty() { + return 1 + } else if c := k.Timestamp.Compare(o.Timestamp); c != 0 { + return -c // timestamps sort in reverse + } + return k.EndKey.Compare(o.EndKey) +} + +// String formats the range key. +func (k MVCCRangeKey) String() string { + s := roachpb.Span{Key: k.StartKey, EndKey: k.EndKey}.String() + if !k.Timestamp.IsEmpty() { + s += fmt.Sprintf("/%s", k.Timestamp) + } + return s +} + +// Validate returns an error if the range key is invalid. +func (k MVCCRangeKey) Validate() (err error) { + return errors.Wrapf(k.validate(), "invalid range key %s", k) +} + +func (k MVCCRangeKey) validate() error { + if k.StartKey == nil { + return errors.Errorf("no start key") + } + if k.EndKey == nil { + return errors.Errorf("no end key") + } + if k.Timestamp.IsEmpty() { + return errors.Errorf("no timestamp") + } + if k.StartKey.Compare(k.EndKey) > 0 { + return errors.Errorf("start key %s is after end key %s", k.StartKey, k.EndKey) + } + return nil +} diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 63af9beda842..8344aca31219 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -238,3 +238,94 @@ func BenchmarkDecodeMVCCKey(b *testing.B) { } benchmarkDecodeMVCCKeyResult = mvccKey // avoid compiler optimizing away function call } + +func TestMVCCRangeKeyString(t *testing.T) { + defer leaktest.AfterTest(t)() + + testcases := map[string]struct { + rk MVCCRangeKey + expect string + }{ + "empty": {MVCCRangeKey{}, "/Min"}, + "only start": {MVCCRangeKey{StartKey: roachpb.Key("foo")}, "foo"}, + "only end": {MVCCRangeKey{EndKey: roachpb.Key("foo")}, "{/Min-foo}"}, + "only timestamp": {MVCCRangeKey{Timestamp: hlc.Timestamp{Logical: 1}}, "/Min/0,1"}, + "only span": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z")}, "{a-z}"}, + "all": {MVCCRangeKey{StartKey: roachpb.Key("a"), EndKey: roachpb.Key("z"), Timestamp: hlc.Timestamp{Logical: 1}}, "{a-z}/0,1"}, + "all overlapping": {MVCCRangeKey{StartKey: roachpb.Key("ab"), EndKey: roachpb.Key("af"), Timestamp: hlc.Timestamp{Logical: 1}}, "a{b-f}/0,1"}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.rk.String()) + }) + } +} + +func TestMVCCRangeKeyCompare(t *testing.T) { + defer leaktest.AfterTest(t)() + + ab1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("b"), hlc.Timestamp{Logical: 1}} + ac1 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + ac2 := MVCCRangeKey{roachpb.Key("a"), roachpb.Key("c"), hlc.Timestamp{Logical: 2}} + bc0 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 0}} + bc1 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 1}} + bc3 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("c"), hlc.Timestamp{Logical: 3}} + bd4 := MVCCRangeKey{roachpb.Key("b"), roachpb.Key("d"), hlc.Timestamp{Logical: 4}} + + testcases := map[string]struct { + a MVCCRangeKey + b MVCCRangeKey + expect int + }{ + "equal": {ac1, ac1, 0}, + "start lt": {ac1, bc1, -1}, + "start gt": {bc1, ac1, 1}, + "end lt": {ab1, ac1, -1}, + "end gt": {ac1, ab1, 1}, + "time lt": {ac2, ac1, -1}, // MVCC timestamps sort in reverse order + "time gt": {ac1, ac2, 1}, // MVCC timestamps sort in reverse order + "empty time lt set": {bc0, bc1, -1}, // empty MVCC timestamps sort before non-empty + "set time gt empty": {bc1, bc0, 1}, // empty MVCC timestamps sort before non-empty + "start time precedence": {ac2, bc3, -1}, + "time end precedence": {bd4, bc3, -1}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + }) + } +} + +func TestMVCCRangeKeyValidate(t *testing.T) { + defer leaktest.AfterTest(t)() + + a := roachpb.Key("a") + b := roachpb.Key("b") + blank := roachpb.Key("") + ts1 := hlc.Timestamp{Logical: 1} + + testcases := map[string]struct { + rangeKey MVCCRangeKey + expectErr string // empty if no error + }{ + "valid": {MVCCRangeKey{StartKey: a, EndKey: b, Timestamp: ts1}, ""}, + "start at end": {MVCCRangeKey{StartKey: a, EndKey: a, Timestamp: ts1}, ""}, + "blank keys": {MVCCRangeKey{StartKey: blank, EndKey: blank, Timestamp: ts1}, ""}, // equivalent to MinKey + "no start": {MVCCRangeKey{EndKey: b, Timestamp: ts1}, "{/Min-b}/0,1: no start key"}, + "no end": {MVCCRangeKey{StartKey: a, Timestamp: ts1}, "a/0,1: no end key"}, + "no timestamp": {MVCCRangeKey{StartKey: a, EndKey: b}, "{a-b}: no timestamp"}, + "empty": {MVCCRangeKey{}, "/Min: no start key"}, + "end before start": {MVCCRangeKey{StartKey: b, EndKey: a, Timestamp: ts1}, `{b-a}/0,1: start key "b" is after end key "a"`}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + err := tc.rangeKey.Validate() + if tc.expectErr == "" { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Contains(t, err.Error(), tc.expectErr) + } + }) + } +} diff --git a/pkg/storage/mvcc_range_key_iterator.go b/pkg/storage/mvcc_range_key_iterator.go new file mode 100644 index 000000000000..1ba4ae3d2c42 --- /dev/null +++ b/pkg/storage/mvcc_range_key_iterator.go @@ -0,0 +1,269 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "bytes" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/errors" +) + +// MVCCRangeKeyIterOptions are options for an MVCCRangeKeyIterator. +type MVCCRangeKeyIterOptions struct { + // LowerBound sets the inclusive lower bound of the iterator. Range keys that + // straddle the bound will have their start key truncated to it. + // + // NB: It may be tempting to use an MVCCKey here and include a timestamp, but + // this would be pointless: giving e.g. a@4 would skip a range key starting at + // a@5, but the range key would logically exist at the adjacent a.Next()@5 so + // it would be emitted almost immediately anyway. + LowerBound roachpb.Key + // UpperBound sets the exclusive upper bound of the iterator. Range keys that + // straddle the upper bound will have their end key truncated to it. + UpperBound roachpb.Key + // MinTimestamp sets the inclusive lower timestamp bound for the iterator. + MinTimestamp hlc.Timestamp + // MaxTimestamp sets the inclusive upper timestamp bound for the iterator. + MaxTimestamp hlc.Timestamp + // Fragmented disables defragmentation, emitting non-deterministic fragments + // like SimpleMVCCIterator does. When enabled, this results in an iteration + // order of StartKey,Timestamp rather than EndKey,Timestamp. + Fragmented bool +} + +// MVCCRangeKeyIterator is an iterator over range keys in an engine. Unlike +// SimpleMVCCIterator, range key are defragmented into contiguous deterministic +// range keys. It does not support seeking or backtracking, see +// MVCCRangeKeyIterOptions for lower/upper bounds and other options. +// +// Iteration is in EndKey,Timestamp order rather than StartKey,Timestamp. For +// example: [c-e)@2, [a-z)@3, [x-z)@1. This is a memory optimization when +// defragmenting, which allows emitting completed range keys as soon as +// possible, only buffering incomplete ones in memory. To emit in +// StartKey,Timestamp order, we would additionally need to buffer all complete +// range keys that start after the current incomplete ones -- in the worst case, +// a range key across the entire key span would require all other range keys to +// be buffered in memory. But see the Fragmented option to emit +// non-deterministic range key fragments in StartKey,Timestamp order. +type MVCCRangeKeyIterator struct { + iter MVCCIterator + opts MVCCRangeKeyIterOptions + incomplete []*MVCCRangeKeyValue // defragmentation buffer + complete []MVCCRangeKeyValue // queued for emission + completeIdx int // current Key() + err error +} + +// NewMVCCRangeKeyIterator sets up a new MVCCRangeKeyIterator and seeks to the +// first range key. The caller must call Close() when done. +func NewMVCCRangeKeyIterator(r Reader, opts MVCCRangeKeyIterOptions) *MVCCRangeKeyIterator { + iter := &MVCCRangeKeyIterator{ + iter: r.NewMVCCIterator(MVCCKeyIterKind, IterOptions{ + KeyTypes: IterKeyTypeRangesOnly, + LowerBound: opts.LowerBound, + UpperBound: opts.UpperBound, + // TODO(erikgrinaker): We do not set Min/MaxTimestampHint here, because + // both are required and it's apparently not always safe to use. + }), + opts: opts, + incomplete: make([]*MVCCRangeKeyValue, 0), + complete: make([]MVCCRangeKeyValue, 0), + } + + // Seek the iterator to the lower bound and iterate until we've collected + // the first complete range key (if any). + iter.iter.SeekGE(MVCCKey{Key: opts.LowerBound}) + iter.findCompleteRangeKeys() + + return iter +} + +// findCompleteRangeKeys defragments range keys at the current iterator position +// and any subsequent iterator positions until it completes one or more range +// keys, populating p.complete. Current p.complete is discarded. +func (p *MVCCRangeKeyIterator) findCompleteRangeKeys() { + p.complete = p.complete[:0] + p.completeIdx = 0 + p.updateRangeKeys() + + for len(p.complete) == 0 { + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if !ok { + break + } + p.iter.Next() + // NB: We update range keys even if Next() invalidates the iterator, because + // there may be incomplete range keys that become complete when the iterator + // is exhausted. + p.updateRangeKeys() + } +} + +// updateRangeKeys inspects the range keys at the current Pebble iterator +// position, defragments them in p.incomplete, and moves any completed +// range keys into p.complete. +func (p *MVCCRangeKeyIterator) updateRangeKeys() { + var startKey, endKey roachpb.Key + var rangeKeys []MVCCRangeKeyValue + + // If the iterator is exhausted, we still want to complete any remaining + // incomplete range keys. + if ok, err := p.iter.Valid(); err != nil { + p.err = err + return + } else if ok { + startKey, endKey = p.iter.RangeBounds() + rangeKeys = p.iter.RangeKeys() + } + + // Both rangeKeys and p.incomplete are sorted in descending timestamp order, + // so we iterate over them in lockstep and insert/update/delete p.incomplete + // as appropriate. + var tsIdx, rkIdx int + + for rkIdx < len(rangeKeys) { + rangeKey := rangeKeys[rkIdx] + + // Filter rangekeys by timestamp. + // + // TODO(erikgrinaker): This can be optimized to skip unnecessary comparisons + // since rangeKeys is sorted by timestamp. Maybe later. + if !p.opts.MinTimestamp.IsEmpty() && rangeKey.Key.Timestamp.Less(p.opts.MinTimestamp) { + rkIdx++ + continue + } + if !p.opts.MaxTimestamp.IsEmpty() && p.opts.MaxTimestamp.Less(rangeKey.Key.Timestamp) { + rkIdx++ + continue + } + + // If we're at the end of p.incomplete, this range key must be new. + if tsIdx >= len(p.incomplete) { + p.incomplete = append(p.incomplete, &MVCCRangeKeyValue{ + Key: MVCCRangeKey{ + StartKey: append(make([]byte, 0, len(startKey)), startKey...), + EndKey: append(make([]byte, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + }, + Value: append(make([]byte, 0, len(rangeKey.Value)), rangeKey.Value...), + }) + rkIdx++ + tsIdx++ + continue + } + + incomplete := p.incomplete[tsIdx] + cmp := incomplete.Key.Timestamp.Compare(rangeKey.Key.Timestamp) + switch { + // If the timestamps match, the key spans are adjacent or overlapping, and + // the values match then this range key extends the incomplete one. + case cmp == 0 && bytes.Compare(startKey, incomplete.Key.EndKey) <= 0 && + bytes.Equal(rangeKey.Value, incomplete.Value): + incomplete.Key.EndKey = append(incomplete.Key.EndKey[:0], endKey...) + tsIdx++ + rkIdx++ + + // This is a different range key at the same timestamp: complete the + // existing one and start a new one. + case cmp == 0: + p.complete = append(p.complete, *incomplete) + // NB: can't reuse slices, as they were placed in the completed range key. + incomplete.Key.StartKey = append(make([]byte, 0, len(startKey)), startKey...) + incomplete.Key.EndKey = append(make([]byte, 0, len(endKey)), endKey...) + incomplete.Value = append(make([]byte, 0, len(rangeKey.Value)), rangeKey.Value...) + + // This incomplete range key is not present at this range key: complete it + // and remove it from the list, then try again. + case cmp == 1: + p.complete = append(p.complete, *incomplete) + p.incomplete = append(p.incomplete[:tsIdx], p.incomplete[tsIdx+1:]...) + + // This range key is a new incomplete range key: start defragmenting it. + case cmp == -1: + p.incomplete = append(p.incomplete[:tsIdx+1], p.incomplete[tsIdx:]...) + p.incomplete[tsIdx] = &MVCCRangeKeyValue{ + Key: MVCCRangeKey{ + StartKey: append(make(roachpb.Key, 0, len(startKey)), startKey...), + EndKey: append(make(roachpb.Key, 0, len(endKey)), endKey...), + Timestamp: rangeKey.Key.Timestamp, + }, + Value: append(make([]byte, 0, len(rangeKey.Value)), rangeKey.Value...), + } + tsIdx++ + rkIdx++ + + default: + p.err = errors.Errorf("unexpected comparison result %d", cmp) + return + } + } + + // If the caller has requested fragments, we complete all range keys we found + // this iteration by resetting tsIdx to 0. The loop below handles the rest. + if p.opts.Fragmented { + tsIdx = 0 + } + + // If there are any remaining incomplete range keys, they must be complete: + // make them so. + for _, ts := range p.incomplete[tsIdx:] { + p.complete = append(p.complete, *ts) + } + p.incomplete = p.incomplete[:tsIdx] +} + +// Close frees up resources held by the iterator. +func (p *MVCCRangeKeyIterator) Close() { + p.iter.Close() + p.complete = nil + p.completeIdx = 0 +} + +// Next iterates to the next defragmented range key. Note the unusual iteration +// order, see struct comment for details. +func (p *MVCCRangeKeyIterator) Next() { + p.completeIdx++ + if p.completeIdx >= len(p.complete) { + p.iter.Next() + // NB: Called even if Next() fails, because we may have incomplete + // range keys that become complete when the iterator is exhausted. + p.findCompleteRangeKeys() + } +} + +// Key returns the current range key. It will not be invalidated by the +// iterator, but it will be shared by all callers. +func (p *MVCCRangeKeyIterator) Key() MVCCRangeKey { + return p.complete[p.completeIdx].Key +} + +// Value returns the value of the current range key. It will not be invalidated +// by the iterator, but it will be shared by all callers. +func (p *MVCCRangeKeyIterator) Value() []byte { + return p.complete[p.completeIdx].Value +} + +// Valid returns (true, nil) if the iterator points to a valid key, (false, nil) +// if the iterator is exhausted, or (false, error) if an error occurred during +// iteration. +func (p *MVCCRangeKeyIterator) Valid() (bool, error) { + if p.err != nil { + return false, p.err + } + if _, err := p.iter.Valid(); err != nil { + return false, err + } + return p.completeIdx < len(p.complete), nil +} diff --git a/pkg/storage/mvcc_range_key_iterator_test.go b/pkg/storage/mvcc_range_key_iterator_test.go new file mode 100644 index 000000000000..329de972fb35 --- /dev/null +++ b/pkg/storage/mvcc_range_key_iterator_test.go @@ -0,0 +1,292 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +func TestMVCCRangeKeyIterator(t *testing.T) { + defer leaktest.AfterTest(t)() + + eng := NewDefaultInMemForTesting() + defer eng.Close() + + rangeKeys := []MVCCRangeKeyValue{ + rangeKV("b", "c", 3, "bc3"), + rangeKV("e", "g", 3, "eg3"), + rangeKV("d", "f", 5, "df5"), + rangeKV("f", "g", 5, "fg5"), + rangeKV("d", "f", 2, "df2"), + rangeKV("a", "m", 4, "az4"), // same value as below so these should merge into one + rangeKV("m", "z", 4, "az4"), + } + for _, rk := range rangeKeys { + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rk.Key, rk.Value)) + } + + testcases := map[string]struct { + opts MVCCRangeKeyIterOptions + expect []MVCCRangeKeyValue + }{ + "all range keys": { + MVCCRangeKeyIterOptions{}, + []MVCCRangeKeyValue{ + rangeKV("b", "c", 3, "bc3"), + rangeKV("d", "f", 5, "df5"), + rangeKV("d", "f", 2, "df2"), + rangeKV("f", "g", 5, "fg5"), + rangeKV("e", "g", 3, "eg3"), + rangeKV("a", "z", 4, "az4"), + }}, + "truncated range keys": { + MVCCRangeKeyIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("e"), + }, + []MVCCRangeKeyValue{ + rangeKV("d", "e", 5, "df5"), + rangeKV("c", "e", 4, "az4"), + rangeKV("d", "e", 2, "df2"), + }}, + "truncation between range key bounds": { + MVCCRangeKeyIterOptions{ + LowerBound: roachpb.Key("ccc"), + UpperBound: roachpb.Key("eee"), + }, + []MVCCRangeKeyValue{ + rangeKV("d", "eee", 5, "df5"), + rangeKV("ccc", "eee", 4, "az4"), + rangeKV("e", "eee", 3, "eg3"), + rangeKV("d", "eee", 2, "df2"), + }}, + "fragmented range keys": { + MVCCRangeKeyIterOptions{ + Fragmented: true, + }, + []MVCCRangeKeyValue{ + rangeKV("a", "b", 4, "az4"), + rangeKV("b", "c", 4, "az4"), + rangeKV("b", "c", 3, "bc3"), + rangeKV("c", "d", 4, "az4"), + rangeKV("d", "e", 5, "df5"), + rangeKV("d", "e", 4, "az4"), + rangeKV("d", "e", 2, "df2"), + rangeKV("e", "f", 5, "df5"), + rangeKV("e", "f", 4, "az4"), + rangeKV("e", "f", 3, "eg3"), + rangeKV("e", "f", 2, "df2"), + rangeKV("f", "g", 5, "fg5"), + rangeKV("f", "g", 4, "az4"), + rangeKV("f", "g", 3, "eg3"), + rangeKV("g", "m", 4, "az4"), + rangeKV("m", "z", 4, "az4"), + }}, + "empty interval": { + MVCCRangeKeyIterOptions{ + LowerBound: roachpb.Key("A"), + UpperBound: roachpb.Key("Z"), + }, + nil}, + "zero-length interval": { + MVCCRangeKeyIterOptions{ + LowerBound: roachpb.Key("c"), + UpperBound: roachpb.Key("c"), + }, + nil}, + "end after start": { + MVCCRangeKeyIterOptions{ + LowerBound: roachpb.Key("e"), + UpperBound: roachpb.Key("d"), + }, + nil}, + "min timestamp": { + MVCCRangeKeyIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKeyValue{ + rangeKV("b", "c", 3, "bc3"), + rangeKV("d", "f", 5, "df5"), + rangeKV("f", "g", 5, "fg5"), + rangeKV("e", "g", 3, "eg3"), + rangeKV("a", "z", 4, "az4"), + }}, + "max timestamp": { + MVCCRangeKeyIterOptions{ + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKeyValue{ + rangeKV("b", "c", 3, "bc3"), + rangeKV("d", "f", 2, "df2"), + rangeKV("e", "g", 3, "eg3"), + }}, + "both timestamps": { + MVCCRangeKeyIterOptions{ + MinTimestamp: hlc.Timestamp{Logical: 3}, + MaxTimestamp: hlc.Timestamp{Logical: 3}, + }, + []MVCCRangeKeyValue{ + rangeKV("b", "c", 3, "bc3"), + rangeKV("e", "g", 3, "eg3"), + }}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + opts := tc.opts + if opts.UpperBound == nil { + opts.UpperBound = keys.MaxKey // appease pebbleIterator + } + iter := NewMVCCRangeKeyIterator(eng, opts) + defer iter.Close() + + var rangeKVs []MVCCRangeKeyValue + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + rangeKVs = append(rangeKVs, MVCCRangeKeyValue{ + Key: iter.Key(), + Value: iter.Value(), + }) + iter.Next() + } + require.Equal(t, tc.expect, rangeKVs) + }) + } +} + +// TestMVCCRangeKeyIteratorTimestampBounds tests that MVCCRangeKeyIterator +// returns appropriate range bounds, even in corner cases where Pebble may +// return bounds with timestamps (if the versions for a key are split across SST +// boundaries). +func TestMVCCIRangeKeyteratorTimestampBounds(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + eng := NewDefaultInMemForTesting() + defer eng.Close() + db := eng.(*Pebble).db + + // First, just set up some regular old range keys. + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rangeKey("a", "z", 1), []byte("az1"))) + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rangeKey("b", "d", 4), []byte("bd4"))) + require.NoError(t, eng.ExperimentalPutMVCCRangeKey(rangeKey("e", "g", 3), []byte("eg3"))) + + // Then, write a range key with suffix bounds. The range key will be + // [b-e)@5=be5, but we'll write it with artificial fragment bounds + // [b-b@5), [b@5-d@2), [d@2-e) + require.NoError(t, db.Experimental().RangeKeySet( // [b-b@5) + EncodeMVCCKey(pointKey("b", 0)), + EncodeMVCCKey(pointKey("b", 5)), + encodeMVCCTimestampSuffix(hlc.Timestamp{Logical: 5}), + []byte("be5"), + nil, + )) + require.NoError(t, db.Experimental().RangeKeySet( // [b@5-d@2) + EncodeMVCCKey(pointKey("b", 5)), + EncodeMVCCKey(pointKey("d", 2)), + encodeMVCCTimestampSuffix(hlc.Timestamp{Logical: 5}), + []byte("be5"), + nil, + )) + require.NoError(t, db.Experimental().RangeKeySet( // [d@2-e) + EncodeMVCCKey(pointKey("d", 2)), + EncodeMVCCKey(pointKey("e", 0)), + encodeMVCCTimestampSuffix(hlc.Timestamp{Logical: 5}), + []byte("be5"), + nil, + )) + + // Scan the fragmented range keys. + iter := NewMVCCRangeKeyIterator(eng, MVCCRangeKeyIterOptions{ + Fragmented: true, + UpperBound: keys.MaxKey, + }) + defer iter.Close() + + var actual []MVCCRangeKeyValue + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + actual = append(actual, MVCCRangeKeyValue{Key: iter.Key(), Value: iter.Value()}) + iter.Next() + } + require.Equal(t, []MVCCRangeKeyValue{ + rangeKV("a", "b", 1, "az1"), + rangeKV("b", "b\x00", 5, "be5"), + rangeKV("b", "b\x00", 4, "bd4"), + rangeKV("b", "b\x00", 1, "az1"), + rangeKV("b", "d", 5, "be5"), + rangeKV("b", "d", 4, "bd4"), + rangeKV("b", "d", 1, "az1"), + rangeKV("d", "d\x00", 5, "be5"), + rangeKV("d", "d\x00", 1, "az1"), + rangeKV("d", "e", 5, "be5"), + rangeKV("d", "e", 1, "az1"), + rangeKV("e", "g", 3, "eg3"), + rangeKV("e", "g", 1, "az1"), + rangeKV("g", "z", 1, "az1"), + }, actual) + + // Scan the defragmented range keys. + iter = NewMVCCRangeKeyIterator(eng, MVCCRangeKeyIterOptions{ + UpperBound: keys.MaxKey, + }) + defer iter.Close() + + actual = nil + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + actual = append(actual, MVCCRangeKeyValue{Key: iter.Key(), Value: iter.Value()}) + iter.Next() + } + require.Equal(t, []MVCCRangeKeyValue{ + rangeKV("b", "d", 4, "bd4"), + rangeKV("b", "e", 5, "be5"), + rangeKV("e", "g", 3, "eg3"), + rangeKV("a", "z", 1, "az1"), + }, actual) +} + +func rangeKey(start, end string, ts int) MVCCRangeKey { + return MVCCRangeKey{ + StartKey: roachpb.Key(start), + EndKey: roachpb.Key(end), + Timestamp: hlc.Timestamp{Logical: int32(ts)}, + } +} + +func rangeKV(start, end string, ts int, value string) MVCCRangeKeyValue { + return MVCCRangeKeyValue{ + Key: rangeKey(start, end, ts), + Value: []byte(value), + } +} + +func pointKey(key string, ts int) MVCCKey { + return MVCCKey{Key: roachpb.Key(key), Timestamp: hlc.Timestamp{Logical: int32(ts)}} +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 02e0bbdc8cbb..850a976205ea 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -481,6 +481,8 @@ func DefaultPebbleOptions() *pebble.Options { TablePropertyCollectors: PebbleTablePropertyCollectors, BlockPropertyCollectors: PebbleBlockPropertyCollectors, } + // Used for experimental MVCC range tombstones. + opts.Experimental.RangeKeys = new(pebble.RangeKeysArena) // Automatically flush 10s after the first range tombstone is added to a // memtable. This ensures that we can reclaim space even when there's no // activity on the database generating flushes. @@ -1108,6 +1110,31 @@ func (p *Pebble) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) error return batch.Commit(true) } +// ExperimentalClearMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset( + encodeMVCCKeyPrefix(rangeKey.StartKey), + encodeMVCCKeyPrefix(rangeKey.EndKey), + encodeMVCCTimestampSuffix(rangeKey.Timestamp), + pebble.Sync) +} + +// ExperimentalPutMVCCRangeKey implements the Engine interface. +func (p *Pebble) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeySet( + encodeMVCCKeyPrefix(rangeKey.StartKey), + encodeMVCCKeyPrefix(rangeKey.EndKey), + encodeMVCCTimestampSuffix(rangeKey.Timestamp), + value, + pebble.Sync) +} + // Merge implements the Engine interface. func (p *Pebble) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { @@ -1882,6 +1909,14 @@ func (p *pebbleReadOnly) ClearIterRange(iter MVCCIterator, start, end roachpb.Ke panic("not implemented") } +func (p *pebbleReadOnly) ExperimentalPutMVCCRangeKey(_ MVCCRangeKey, _ []byte) error { + panic("not implemented") +} + +func (p *pebbleReadOnly) ExperimentalClearMVCCRangeKey(_ MVCCRangeKey) error { + panic("not implemented") +} + func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error { panic("not implemented") } diff --git a/pkg/storage/pebble_batch.go b/pkg/storage/pebble_batch.go index 117c2995cfa8..cce85cfaecc2 100644 --- a/pkg/storage/pebble_batch.go +++ b/pkg/storage/pebble_batch.go @@ -413,6 +413,31 @@ func (p *pebbleBatch) ClearIterRange(iter MVCCIterator, start, end roachpb.Key) return nil } +// ExperimentalClearMVCCRangeKey implements the Engine interface. +func (p *pebbleBatch) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeyUnset( + encodeMVCCKeyPrefix(rangeKey.StartKey), + encodeMVCCKeyPrefix(rangeKey.EndKey), + encodeMVCCTimestampSuffix(rangeKey.Timestamp), + nil) +} + +// ExperimentalPutMVCCRangeKey implements the Batch interface. +func (p *pebbleBatch) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + if err := rangeKey.Validate(); err != nil { + return err + } + return p.db.Experimental().RangeKeySet( + encodeMVCCKeyPrefix(rangeKey.StartKey), + encodeMVCCKeyPrefix(rangeKey.EndKey), + encodeMVCCTimestampSuffix(rangeKey.Timestamp), + value, + nil) +} + // Merge implements the Batch interface. func (p *pebbleBatch) Merge(key MVCCKey, value []byte) error { if len(key.Key) == 0 { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index cc4a5ebe502e..87a24a1c6114 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" @@ -166,6 +167,8 @@ func (p *pebbleIterator) init(handle pebble.Reader, iterToClone cloneableIter, o panic("min timestamp hint set without max timestamp hint") } + p.options.KeyTypes = opts.KeyTypes + if doClone { var err error if p.iter, err = iterToClone.Clone(); err != nil { @@ -332,6 +335,17 @@ func (p *pebbleIterator) Valid() (bool, error) { // NB: A Pebble Iterator always returns Valid()==false when an error is // present. If Valid() is true, there is no error. if ok := p.iter.Valid(); ok { + // TODO(erikgrinaker): Pebble does not seem to respect bounds for range + // keys, so for now we invalidate the iterator here instead. + if hasPoint, hasRange := p.HasPointAndRange(); !hasPoint && hasRange { + start, end := p.iter.RangeBounds() + if len(p.options.UpperBound) > 0 && bytes.Compare(start, p.options.UpperBound) >= 0 { + return false, nil + } + if len(p.options.LowerBound) > 0 && bytes.Compare(end, p.options.LowerBound) <= 0 { + return false, nil + } + } // The MVCCIterator interface is broken in that it silently discards // the error when UnsafeKey(), Key() are unable to parse the key as // an MVCCKey. This is especially problematic if the caller is @@ -575,6 +589,88 @@ func (p *pebbleIterator) ValueProto(msg protoutil.Message) error { return protoutil.Unmarshal(value, msg) } +// HasPointAndRange implements the MVCCIterator interface. +func (p *pebbleIterator) HasPointAndRange() (bool, bool) { + return p.iter.HasPointAndRange() +} + +// RangeBounds implements the MVCCIterator interface. +func (p *pebbleIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + start, end := p.iter.RangeBounds() + + // TODO(erikgrinaker): Pebble does not yet truncate range keys to the + // LowerBound or UpperBound, so we truncate them here. + if len(p.options.LowerBound) > 0 && bytes.Compare(start, p.options.LowerBound) < 0 { + start = p.options.LowerBound + } + if len(p.options.UpperBound) > 0 && bytes.Compare(end, p.options.UpperBound) > 0 { + end = p.options.UpperBound + } + + // TODO(erikgrinaker): We should surface this error somehow, but for now + // we follow UnsafeKey()'s example and silenty return empty bounds. + startKey, err := DecodeMVCCKey(start) + if err != nil { + return nil, nil + } + endKey, err := DecodeMVCCKey(end) + if err != nil { + return nil, nil + } + + // It's possible for a range key to straddle an SST boundary in between two + // versions/timestamps of a key, e.g. [b-c@5),[c@5-e). In this case, Pebble + // will return bounds with timestamps. Because we don't allow writing range + // keys with timestamp bounds, we know that all timestamps of the key (on both + // sides of the boundary) already have the complete set of range key values at + // all timestamps. + // + // We choose the simplest solution here, which is to allow overlapping range + // bounds such that at c@6 we claim [b-c\0) and at c@5 we claim [c-e). It + // would be possible to guarantee non-overlapping bounds with additional state + // tracking and seeks, but that appears non-trivial for every edge case and + // has a risk of leaving gaps, which seems more scary than dealing with + // overlaps. + // + // TODO(erikgrinaker): Consider coming up with a solution to guarantee + // non-overlapping bounds. + if !startKey.Timestamp.IsEmpty() { + startKey.Timestamp = hlc.Timestamp{} + } + if !endKey.Timestamp.IsEmpty() { + endKey.Key = endKey.Key.Next() + endKey.Timestamp = hlc.Timestamp{} + } + return startKey.Key, endKey.Key +} + +// RangeKeys implements the MVCCIterator interface. +// +// TODO(erikgrinaker): Add unit tests for the range key methods. +func (p *pebbleIterator) RangeKeys() []MVCCRangeKeyValue { + startKey, endKey := p.RangeBounds() + rangeKeys := p.iter.RangeKeys() + rangeValues := make([]MVCCRangeKeyValue, 0, len(rangeKeys)) + + for _, rangeKey := range rangeKeys { + timestamp, err := decodeMVCCTimestampSuffix(rangeKey.Suffix) + if err != nil { + // TODO(erikgrinaker): We should surface this error somehow, but for now + // we follow UnsafeKey()'s example and silenty return empty bounds. + continue + } + rangeValues = append(rangeValues, MVCCRangeKeyValue{ + Key: MVCCRangeKey{ + StartKey: startKey, + EndKey: endKey, + Timestamp: timestamp, + }, + Value: rangeKey.Value, + }) + } + return rangeValues +} + // ComputeStats implements the MVCCIterator interface. func (p *pebbleIterator) ComputeStats( start, end roachpb.Key, nowNanos int64, diff --git a/pkg/storage/sst_iterator.go b/pkg/storage/sst_iterator.go index 9bd8b49b1b39..c5c81b756c37 100644 --- a/pkg/storage/sst_iterator.go +++ b/pkg/storage/sst_iterator.go @@ -158,3 +158,18 @@ func (r *sstIterator) UnsafeKey() MVCCKey { func (r *sstIterator) UnsafeValue() []byte { return r.value } + +// HasPointAndRange implements SimpleMVCCIterator. +func (r *sstIterator) HasPointAndRange() (bool, bool) { + panic("not implemented") +} + +// RangeBounds implements SimpleMVCCIterator. +func (r *sstIterator) RangeBounds() (roachpb.Key, roachpb.Key) { + panic("not implemented") +} + +// RangeKeys implements SimpleMVCCIterator. +func (r *sstIterator) RangeKeys() []MVCCRangeKeyValue { + panic("not implemented") +} diff --git a/pkg/storage/sst_writer.go b/pkg/storage/sst_writer.go index 9b58c8afebc1..3643360bf4c3 100644 --- a/pkg/storage/sst_writer.go +++ b/pkg/storage/sst_writer.go @@ -118,6 +118,16 @@ func (fw *SSTWriter) ClearMVCCRange(start, end MVCCKey) error { return fw.clearRange(start, end) } +// ExperimentalPutMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalPutMVCCRangeKey(rangeKey MVCCRangeKey, value []byte) error { + panic("not implemented") +} + +// ExperimentalClearMVCCRangeKey implements the Writer interface. +func (fw *SSTWriter) ExperimentalClearMVCCRangeKey(rangeKey MVCCRangeKey) error { + panic("not implemented") +} + func (fw *SSTWriter) clearRange(start, end MVCCKey) error { if fw.fw == nil { return errors.New("cannot call ClearRange on a closed writer") diff --git a/pkg/storage/testdata/mvcc_histories/delete_range_tombstone b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone new file mode 100644 index 000000000000..a69416ee7163 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/delete_range_tombstone @@ -0,0 +1,111 @@ +# TODO(erikgrinaker): The MVCC API does not respect range tombstones yet, so +# we don't test point keys because they remain unaffected. +# TODO(erikgrinaker): This needs conflict tests, implement later. + +# Write some range tombstones. Some will abut and merge. +run ok +del_range_ts k=b end=c ts=3 +del_range_ts k=e end=g ts=3 +del_range_ts k=d end=f ts=5 +del_range_ts k=d end=f ts=2 +del_range_ts k=m end=z ts=1 +del_range_ts k=a end=m ts=4 +del_range_ts k=m end=z ts=4 +---- +del_range_ts: {b-c}/3.000000000,0 +del_range_ts: {e-g}/3.000000000,0 +del_range_ts: {d-f}/5.000000000,0 +del_range_ts: {d-f}/2.000000000,0 +del_range_ts: {m-z}/1.000000000,0 +del_range_ts: {a-m}/4.000000000,0 +del_range_ts: {m-z}/4.000000000,0 +>> at end: +range key: {b-c}/3.000000000,0 -> [] +range key: {d-f}/5.000000000,0 -> [] +range key: {d-f}/2.000000000,0 -> [] +range key: {e-g}/3.000000000,0 -> [] +range key: {a-z}/4.000000000,0 -> [] +range key: {m-z}/1.000000000,0 -> [] + +# Scan all tombstones. +run ok +scan_range_ts k=a end=z +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/5.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {a-z}/4.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Scan truncates tombstones to scan bounds. +run ok +scan_range_ts k=c end=e +---- +scan_range_ts: {d-e}/5.000000000,0 +scan_range_ts: {c-e}/4.000000000,0 +scan_range_ts: {d-e}/2.000000000,0 + +# Scan truncates tombstones to scan bounds when not on tombstone bounds. +run ok +scan_range_ts k=ccc end=eee +---- +scan_range_ts: {d-eee}/5.000000000,0 +scan_range_ts: {ccc-eee}/4.000000000,0 +scan_range_ts: e{-ee}/3.000000000,0 +scan_range_ts: {d-eee}/2.000000000,0 + +# Scan at a timestamp. +run ok +scan_range_ts k=a end=z ts=3 +---- +scan_range_ts: {b-c}/3.000000000,0 +scan_range_ts: {d-f}/2.000000000,0 +scan_range_ts: {e-g}/3.000000000,0 +scan_range_ts: {m-z}/1.000000000,0 + +# Empty scans. +run ok +scan_range_ts k=A end=Z +scan_range_ts k=c end=c +scan_range_ts k=z end=a +---- +scan_range_ts: "A"-"Z" -> +scan_range_ts: "c"-"c" -> +scan_range_ts: "z"-"a" -> + +# Remove some tombstones, both a non-existant one and a span across two +# tombstones. +run ok +clear_range_key k=a end=z ts=10 +clear_range_key k=b end=g ts=3 +---- +>> at end: +range key: {d-f}/5.000000000,0 -> [] +range key: {d-f}/2.000000000,0 -> [] +range key: {a-z}/4.000000000,0 -> [] +range key: {m-z}/1.000000000,0 -> [] + +# Remove the middle section of the [a-z)@4 tombstone. Do it twice, to +# make sure clears are idempotent. +run ok +clear_range_key k=k end=n ts=4 +clear_range_key k=k end=n ts=4 +---- +>> at end: +range key: {d-f}/5.000000000,0 -> [] +range key: {d-f}/2.000000000,0 -> [] +range key: {a-k}/4.000000000,0 -> [] +range key: {n-z}/4.000000000,0 -> [] +range key: {m-z}/1.000000000,0 -> [] + +# Remove portions of the [a-k)@4 and [n-z)@4 tombstones in one operation. +run ok +clear_range_key k=eee end=ttt ts=4 +---- +>> at end: +range key: {a-eee}/4.000000000,0 -> [] +range key: {d-f}/5.000000000,0 -> [] +range key: {d-f}/2.000000000,0 -> [] +range key: {ttt-z}/4.000000000,0 -> [] +range key: {m-z}/1.000000000,0 -> [] diff --git a/pkg/util/hlc/timestamp.go b/pkg/util/hlc/timestamp.go index 0263ac1d3946..b84b39207fa6 100644 --- a/pkg/util/hlc/timestamp.go +++ b/pkg/util/hlc/timestamp.go @@ -54,6 +54,22 @@ func (t Timestamp) LessEq(s Timestamp) bool { return t.WallTime < s.WallTime || (t.WallTime == s.WallTime && t.Logical <= s.Logical) } +// Compare returns -1 if this timestamp is lesser than the given timestamp, 1 if +// it is greater, and 0 if they are equal. +func (t Timestamp) Compare(s Timestamp) int { + if t.WallTime > s.WallTime { + return 1 + } else if t.WallTime < s.WallTime { + return -1 + } else if t.Logical > s.Logical { + return 1 + } else if t.Logical < s.Logical { + return -1 + } else { + return 0 + } +} + // String implements the fmt.Stringer interface. func (t Timestamp) String() string { // The following code was originally written as diff --git a/pkg/util/hlc/timestamp_test.go b/pkg/util/hlc/timestamp_test.go index 59e45dde2a87..9ba6ffe81764 100644 --- a/pkg/util/hlc/timestamp_test.go +++ b/pkg/util/hlc/timestamp_test.go @@ -15,6 +15,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func makeTS(walltime int64, logical int32) Timestamp { @@ -85,6 +86,38 @@ func TestLessEq(t *testing.T) { } } +func TestCompare(t *testing.T) { + w0l0 := Timestamp{} + w1l1 := Timestamp{WallTime: 1, Logical: 1} + w1l2 := Timestamp{WallTime: 1, Logical: 2} + w2l1 := Timestamp{WallTime: 2, Logical: 1} + w2l2 := Timestamp{WallTime: 2, Logical: 2} + + testcases := map[string]struct { + a Timestamp + b Timestamp + expect int + }{ + "empty eq empty": {w0l0, w0l0, 0}, + "empty lt set": {w0l0, w1l1, -1}, + "set gt empty": {w1l1, w0l0, 1}, + "set eq set": {w1l1, w1l1, 0}, + + "wall lt": {w1l1, w2l1, -1}, + "wall gt": {w2l1, w1l1, 1}, + "logical lt": {w1l1, w1l2, -1}, + "logical gt": {w1l2, w1l1, 1}, + "both lt": {w1l1, w2l2, -1}, + "both gt": {w2l2, w1l1, 1}, + "wall precedence": {w2l1, w1l2, 1}, + } + for name, tc := range testcases { + t.Run(name, func(t *testing.T) { + require.Equal(t, tc.expect, tc.a.Compare(tc.b)) + }) + } +} + func TestIsEmpty(t *testing.T) { a := makeTS(0, 0) assert.True(t, a.IsEmpty())