diff --git a/pkg/ccl/storageccl/engineccl/BUILD.bazel b/pkg/ccl/storageccl/engineccl/BUILD.bazel index ed6ac50f7bee..ced2740e0ebd 100644 --- a/pkg/ccl/storageccl/engineccl/BUILD.bazel +++ b/pkg/ccl/storageccl/engineccl/BUILD.bazel @@ -43,6 +43,7 @@ go_test( "//pkg/base", "//pkg/ccl/baseccl", "//pkg/ccl/storageccl/engineccl/enginepbccl:enginepbccl_go_proto", + "//pkg/keys", "//pkg/roachpb", "//pkg/settings/cluster", "//pkg/storage", diff --git a/pkg/ccl/storageccl/engineccl/bench_test.go b/pkg/ccl/storageccl/engineccl/bench_test.go index 21205f84c8ea..c95540b25b3e 100644 --- a/pkg/ccl/storageccl/engineccl/bench_test.go +++ b/pkg/ccl/storageccl/engineccl/bench_test.go @@ -17,6 +17,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" @@ -153,7 +154,7 @@ func runIterate( } it := makeIterator(eng, startTime, endTime) defer it.Close() - for it.SeekGE(storage.MVCCKey{}); ; it.Next() { + for it.SeekGE(storage.MVCCKey{Key: keys.LocalMax}); ; it.Next() { if ok, err := it.Valid(); !ok { if err != nil { b.Fatal(err) diff --git a/pkg/ccl/storageccl/export_test.go b/pkg/ccl/storageccl/export_test.go index aadae5244c24..6f524fe98c2d 100644 --- a/pkg/ccl/storageccl/export_test.go +++ b/pkg/ccl/storageccl/export_test.go @@ -729,6 +729,7 @@ func TestRandomKeyAndTimestampExport(t *testing.T) { return keys, timestamps } + localMax := keys.LocalMax testWithTargetSize := func(t *testing.T, targetSize uint64) { e, cleanup := mkEngine(t) defer cleanup() @@ -736,7 +737,7 @@ func TestRandomKeyAndTimestampExport(t *testing.T) { numKeys := getNumKeys(t, rnd, targetSize) keys, timestamps := mkData(t, e, rnd, numKeys) var ( - keyMin = roachpb.KeyMin + keyMin = localMax keyMax = roachpb.KeyMax tsMin = hlc.Timestamp{WallTime: 0, Logical: 0} diff --git a/pkg/ccl/storageccl/import_test.go b/pkg/ccl/storageccl/import_test.go index 9b412a42da22..6661f2fe507c 100644 --- a/pkg/ccl/storageccl/import_test.go +++ b/pkg/ccl/storageccl/import_test.go @@ -68,7 +68,7 @@ func TestMaxImportBatchSize(t *testing.T) { func slurpSSTablesLatestKey( t *testing.T, dir string, paths []string, kr prefixRewriter, ) []storage.MVCCKeyValue { - start, end := storage.MVCCKey{Key: keys.MinKey}, storage.MVCCKey{Key: keys.MaxKey} + start, end := storage.MVCCKey{Key: keys.LocalMax}, storage.MVCCKey{Key: keys.MaxKey} e := storage.NewDefaultInMem() defer e.Close() @@ -111,9 +111,9 @@ func slurpSSTablesLatestKey( v := roachpb.Value{RawBytes: newKv.Value} v.ClearChecksum() v.InitChecksum(newKv.Key.Key) - // TODO(sumeer): this will not be correct with the separated - // lock table. We should iterate using EngineKey on the sst, - // and expose a PutEngine method to write directly. + // NB: import data does not contain intents, so data with no timestamps + // is inline meta and not intents. Therefore this is not affected by the + // choice of interleaved or separated intents. if newKv.Key.Timestamp.IsEmpty() { if err := batch.PutUnversioned(newKv.Key.Key, v.RawBytes); err != nil { t.Fatal(err) diff --git a/pkg/keys/constants.go b/pkg/keys/constants.go index 10f00d4a3788..d4f8c978accf 100644 --- a/pkg/keys/constants.go +++ b/pkg/keys/constants.go @@ -53,8 +53,8 @@ var ( localSuffixLength = 4 // There are five types of local key data enumerated below: replicated - // range-ID, unreplicated range-ID, range local, range lock, and - // store-local keys. + // range-ID, unreplicated range-ID, range local, store-local, and range lock + // keys. // 1. Replicated Range-ID keys // @@ -145,36 +145,10 @@ var ( // transaction records. The additional detail is the transaction id. LocalTransactionSuffix = roachpb.RKey("txn-") - // 4. Lock table keys + // 4. Store local keys // - // LocalRangeLockTablePrefix specifies the key prefix for the lock - // table. It is immediately followed by the LockTableSingleKeyInfix, - // and then the key being locked. - // - // The lock strength and txn UUID are not in the part of the key that - // the keys package deals with. They are in the versioned part of the - // key (see EngineKey.Version). This permits the storage engine to use - // bloom filters when searching for all locks for a lockable key. - // - // Different lock strengths may use different value types. The exclusive - // lock strength uses MVCCMetadata as the value type, since it does - // double duty as a reference to a provisional MVCC value. - // TODO(sumeer): remember to adjust this comment when adding locks of - // other strengths, or range locks. - LocalRangeLockTablePrefix = roachpb.Key(makeKey(localPrefix, roachpb.RKey("l"))) - LockTableSingleKeyInfix = []byte("k") - // LockTableSingleKeyStart is the inclusive start key of the key range - // containing single key locks. - LockTableSingleKeyStart = roachpb.Key(makeKey(LocalRangeLockTablePrefix, LockTableSingleKeyInfix)) - // LockTableSingleKeyEnd is the exclusive end key of the key range - // containing single key locks. - LockTableSingleKeyEnd = roachpb.Key( - makeKey(LocalRangeLockTablePrefix, roachpb.Key(LockTableSingleKeyInfix).PrefixEnd())) - - // 5. Store local keys - // - // localStorePrefix is the prefix identifying per-store data. - localStorePrefix = makeKey(localPrefix, roachpb.Key("s")) + // LocalStorePrefix is the prefix identifying per-store data. + LocalStorePrefix = makeKey(localPrefix, roachpb.Key("s")) // localStoreSuggestedCompactionSuffix stores suggested compactions to // be aggregated and processed on the store. localStoreSuggestedCompactionSuffix = []byte("comp") @@ -210,6 +184,32 @@ var ( // LocalStoreCachedSettingsKeyMax is the end of span of possible cached settings keys. LocalStoreCachedSettingsKeyMax = LocalStoreCachedSettingsKeyMin.PrefixEnd() + // 5. Lock table keys + // + // LocalRangeLockTablePrefix specifies the key prefix for the lock + // table. It is immediately followed by the LockTableSingleKeyInfix, + // and then the key being locked. + // + // The lock strength and txn UUID are not in the part of the key that + // the keys package deals with. They are in the versioned part of the + // key (see EngineKey.Version). This permits the storage engine to use + // bloom filters when searching for all locks for a lockable key. + // + // Different lock strengths may use different value types. The exclusive + // lock strength uses MVCCMetadata as the value type, since it does + // double duty as a reference to a provisional MVCC value. + // TODO(sumeer): remember to adjust this comment when adding locks of + // other strengths, or range locks. + LocalRangeLockTablePrefix = roachpb.Key(makeKey(localPrefix, roachpb.RKey("z"))) + LockTableSingleKeyInfix = []byte("k") + // LockTableSingleKeyStart is the inclusive start key of the key range + // containing single key locks. + LockTableSingleKeyStart = roachpb.Key(makeKey(LocalRangeLockTablePrefix, LockTableSingleKeyInfix)) + // LockTableSingleKeyEnd is the exclusive end key of the key range + // containing single key locks. + LockTableSingleKeyEnd = roachpb.Key( + makeKey(LocalRangeLockTablePrefix, roachpb.Key(LockTableSingleKeyInfix).PrefixEnd())) + // The global keyspace includes the meta{1,2}, system, system tenant SQL // keys, and non-system tenant SQL keys. diff --git a/pkg/keys/doc.go b/pkg/keys/doc.go index 1abc504b8bff..3147014c29a3 100644 --- a/pkg/keys/doc.go +++ b/pkg/keys/doc.go @@ -137,7 +137,9 @@ // be range local keys. If not, they're meant to be range-ID local keys. Any key // we need to re-write during splits/merges will needs to go through Raft. We // have limits set on the size of Raft proposals so we generally don’t want to -// be re-writing lots of data. +// be re-writing lots of data. Range lock keys (see below) are separate from +// range local keys, but behave similarly in that they split and merge along +// range boundaries. // // This naturally leads to range-id local keys being used to store metadata // about a specific Range and range local keys being used to store metadata @@ -157,8 +159,9 @@ var _ = [...]interface{}{ MinKey, // There are five types of local key data enumerated below: replicated - // range-ID, unreplicated range-ID, range local, range lock, and - // store-local keys. + // range-ID, unreplicated range-ID, range local, store-local, and range lock + // keys. Range lock keys are required to be last category of keys in the + // lock key space. // Local keys are constructed using a prefix, an optional infix, and a // suffix. The prefix and infix are used to disambiguate between the four // types of local keys listed above, and determines inter-group ordering. @@ -169,14 +172,14 @@ var _ = [...]interface{}{ // - RangeID unreplicated keys all share `LocalRangeIDPrefix` and // `localRangeIDUnreplicatedInfix`. // - Range local keys all share `LocalRangePrefix`. + // - Store keys all share `localStorePrefix`. // - Range lock (which are also local keys) all share // `LocalRangeLockTablePrefix`. - // - Store keys all share `localStorePrefix`. // - // `LocalRangeIDPrefix`, `localRangePrefix`, `LocalRangeLockTablePrefix`, - // and `localStorePrefix` all in turn share `localPrefix`. `localPrefix` was - // chosen arbitrarily. Local keys would work just as well with a different - // prefix, like 0xff, or even with a suffix. + // `LocalRangeIDPrefix`, `localRangePrefix`, `localStorePrefix`, and + // `LocalRangeLockTablePrefix` all in turn share `localPrefix`. + // `localPrefix` was chosen arbitrarily. Local keys would work just as well + // with a different prefix, like 0xff, or even with a suffix. // 1. Replicated range-ID local keys: These store metadata pertaining to a // range as a whole. Though they are replicated, they are unaddressable. @@ -210,17 +213,7 @@ var _ = [...]interface{}{ RangeDescriptorKey, // "rdsc" TransactionKey, // "txn-" - // 4. Range lock keys for all replicated locks. All range locks share - // LocalRangeLockTablePrefix. Locks can be acquired on global keys and on - // range local keys. Currently, locks are only on single keys, i.e., not - // on a range of keys. Only exclusive locks are currently supported, and - // these additionally function as pointers to the provisional MVCC values. - // Single key locks use a byte, LockTableSingleKeyInfix, that follows - // the LocalRangeLockTablePrefix. This is to keep the single-key locks - // separate from (future) range locks. - LockTableSingleKey, - - // 5. Store local keys: These contain metadata about an individual store. + // 4. Store local keys: These contain metadata about an individual store. // They are unreplicated and unaddressable. The typical example is the // store 'ident' record. They all share `localStorePrefix`. StoreClusterVersionKey, // "cver" @@ -231,6 +224,16 @@ var _ = [...]interface{}{ StoreLastUpKey, // "uptm" StoreCachedSettingsKey, // "stng" + // 5. Range lock keys for all replicated locks. All range locks share + // LocalRangeLockTablePrefix. Locks can be acquired on global keys and on + // range local keys. Currently, locks are only on single keys, i.e., not + // on a range of keys. Only exclusive locks are currently supported, and + // these additionally function as pointers to the provisional MVCC values. + // Single key locks use a byte, LockTableSingleKeyInfix, that follows + // the LocalRangeLockTablePrefix. This is to keep the single-key locks + // separate from (future) range locks. + LockTableSingleKey, + // The global keyspace includes the meta{1,2}, system, system tenant SQL // keys, and non-system tenant SQL keys. // diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go index 0e4137ef9ca7..00e614bcfb88 100644 --- a/pkg/keys/keys.go +++ b/pkg/keys/keys.go @@ -27,8 +27,8 @@ func makeKey(keys ...[]byte) []byte { // MakeStoreKey creates a store-local key based on the metadata key // suffix, and optional detail. func MakeStoreKey(suffix, detail roachpb.RKey) roachpb.Key { - key := make(roachpb.Key, 0, len(localStorePrefix)+len(suffix)+len(detail)) - key = append(key, localStorePrefix...) + key := make(roachpb.Key, 0, len(LocalStorePrefix)+len(suffix)+len(detail)) + key = append(key, LocalStorePrefix...) key = append(key, suffix...) key = append(key, detail...) return key @@ -37,11 +37,11 @@ func MakeStoreKey(suffix, detail roachpb.RKey) roachpb.Key { // DecodeStoreKey returns the suffix and detail portions of a local // store key. func DecodeStoreKey(key roachpb.Key) (suffix, detail roachpb.RKey, err error) { - if !bytes.HasPrefix(key, localStorePrefix) { - return nil, nil, errors.Errorf("key %s does not have %s prefix", key, localStorePrefix) + if !bytes.HasPrefix(key, LocalStorePrefix) { + return nil, nil, errors.Errorf("key %s does not have %s prefix", key, LocalStorePrefix) } // Cut the prefix, the Range ID, and the infix specifier. - key = key[len(localStorePrefix):] + key = key[len(LocalStorePrefix):] if len(key) < localSuffixLength { return nil, nil, errors.Errorf("malformed key does not contain local store suffix") } @@ -481,12 +481,6 @@ func IsLocal(k roachpb.Key) bool { return bytes.HasPrefix(k, localPrefix) } -// IsLocalStoreKey performs a cheap check that returns true iff the parameter -// is a local store key. -func IsLocalStoreKey(k roachpb.Key) bool { - return bytes.HasPrefix(k, localStorePrefix) -} - // Addr returns the address for the key, used to lookup the range containing the // key. In the normal case, this is simply the key's value. However, for local // keys, such as transaction records, the address is the inner encoded key, with @@ -512,7 +506,7 @@ func Addr(k roachpb.Key) (roachpb.RKey, error) { } for { - if bytes.HasPrefix(k, localStorePrefix) { + if bytes.HasPrefix(k, LocalStorePrefix) { return nil, errors.Errorf("store-local key %q is not addressable", k) } if bytes.HasPrefix(k, LocalRangeIDPrefix) { diff --git a/pkg/keys/printer.go b/pkg/keys/printer.go index e3a03bdcd7ff..28fba6fd6d2e 100644 --- a/pkg/keys/printer.go +++ b/pkg/keys/printer.go @@ -71,7 +71,7 @@ var ( // KeyDict drives the pretty-printing and pretty-scanning of the key space. KeyDict = KeyComprehensionTable{ {Name: "/Local", start: localPrefix, end: LocalMax, Entries: []DictEntry{ - {Name: "/Store", prefix: roachpb.Key(localStorePrefix), + {Name: "/Store", prefix: roachpb.Key(LocalStorePrefix), ppFunc: localStoreKeyPrint, PSFunc: localStoreKeyParse}, {Name: "/RangeID", prefix: roachpb.Key(LocalRangeIDPrefix), ppFunc: localRangeIDKeyPrint, PSFunc: localRangeIDKeyParse}, @@ -218,11 +218,11 @@ func localStoreKeyPrint(_ []encoding.Direction, key roachpb.Key) string { if bytes.HasPrefix(key, v.key) { if v.key.Equal(localStoreNodeTombstoneSuffix) { return v.name + "/" + nodeTombstoneKeyPrint( - append(roachpb.Key(nil), append(localStorePrefix, key...)...), + append(roachpb.Key(nil), append(LocalStorePrefix, key...)...), ) } else if v.key.Equal(localStoreCachedSettingsSuffix) { return v.name + "/" + cachedSettingsKeyPrint( - append(roachpb.Key(nil), append(localStorePrefix, key...)...), + append(roachpb.Key(nil), append(LocalStorePrefix, key...)...), ) } return v.name diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go index cbcf83cb00ac..c999672787ce 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go @@ -416,6 +416,56 @@ func IsEndTxnTriggeringRetryError( const lockResolutionBatchSize = 500 +// iterManager provides a storage.IterAndBuf appropriate for working with a +// span of keys that are either all local or all global keys, identified by +// the start key of the span, that is passed to getIterAndBuf. This is to deal +// with the constraint that a single MVCCIterator using +// MVCCKeyAndIntentsIterKind can either iterate over local keys or global +// keys, but not both. We don't wish to create a new iterator for each span, +// so iterManager lazily creates a new one when needed. +type iterManager struct { + reader storage.Reader + globalKeyUpperBound roachpb.Key + iterAndBuf storage.IterAndBuf + + iter storage.MVCCIterator + isLocalIter bool +} + +func (im *iterManager) getIterAndBuf(key roachpb.Key) storage.IterAndBuf { + isLocal := keys.IsLocal(key) + if im.iter != nil { + if im.isLocalIter == isLocal { + return im.iterAndBuf + } + im.iterAndBuf.SwitchIter(nil /* iter */) + im.iter.Close() + im.iter = nil + } + if isLocal { + im.iter = im.reader.NewMVCCIterator( + storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ + UpperBound: keys.LocalMax, + }) + im.isLocalIter = true + im.iterAndBuf.SwitchIter(im.iter) + } else { + im.iter = im.reader.NewMVCCIterator( + storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ + UpperBound: im.globalKeyUpperBound, + }) + im.isLocalIter = false + im.iterAndBuf.SwitchIter(im.iter) + } + return im.iterAndBuf +} + +func (im *iterManager) Close() { + im.iterAndBuf.Cleanup() + im.iterAndBuf = storage.IterAndBuf{} + im.iter = nil +} + // resolveLocalLocks synchronously resolves any locks that are local to this // range in the same batch and returns those lock spans. The remainder are // collected and returned so that they can be handed off to asynchronous @@ -439,11 +489,12 @@ func resolveLocalLocks( desc = &mergeTrigger.LeftDesc } - iter := readWriter.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ - UpperBound: desc.EndKey.AsRawKey(), - }) - iterAndBuf := storage.GetBufUsingIter(iter) - defer iterAndBuf.Cleanup() + iterManager := &iterManager{ + reader: readWriter, + globalKeyUpperBound: desc.EndKey.AsRawKey(), + iterAndBuf: storage.GetBufUsingIter(nil), + } + defer iterManager.Close() var resolveAllowance int64 = lockResolutionBatchSize if args.InternalCommitTrigger != nil { @@ -466,7 +517,8 @@ func resolveLocalLocks( return nil } resolveMS := ms - ok, err := storage.MVCCResolveWriteIntentUsingIter(ctx, readWriter, iterAndBuf, resolveMS, update) + ok, err := storage.MVCCResolveWriteIntentUsingIter( + ctx, readWriter, iterManager.getIterAndBuf(span.Key), resolveMS, update) if err != nil { return err } @@ -483,7 +535,8 @@ func resolveLocalLocks( externalLocks = append(externalLocks, outSpans...) if inSpan != nil { update.Span = *inSpan - num, resumeSpan, err := storage.MVCCResolveWriteIntentRangeUsingIter(ctx, readWriter, iterAndBuf, ms, update, resolveAllowance) + num, resumeSpan, err := storage.MVCCResolveWriteIntentRangeUsingIter( + ctx, readWriter, iterManager.getIterAndBuf(update.Span.Key), ms, update, resolveAllowance) if err != nil { return err } diff --git a/pkg/kv/kvserver/batcheval/cmd_gc.go b/pkg/kv/kvserver/batcheval/cmd_gc.go index 18a439407b2c..e083b59aa122 100644 --- a/pkg/kv/kvserver/batcheval/cmd_gc.go +++ b/pkg/kv/kvserver/batcheval/cmd_gc.go @@ -66,18 +66,27 @@ func GC( // safe because they can simply be re-collected later on the correct // replica. Discrepancies here can arise from race conditions during // range splitting. - keys := make([]roachpb.GCRequest_GCKey, 0, len(args.Keys)) + globalKeys := make([]roachpb.GCRequest_GCKey, 0, len(args.Keys)) + // Local keys are rarer, so don't pre-allocate slice. We separate the two + // kinds of keys since it is a requirement when calling MVCCGarbageCollect. + var localKeys []roachpb.GCRequest_GCKey for _, k := range args.Keys { if cArgs.EvalCtx.ContainsKey(k.Key) { - keys = append(keys, k) + if keys.IsLocal(k.Key) { + localKeys = append(localKeys, k) + } else { + globalKeys = append(globalKeys, k) + } } } // Garbage collect the specified keys by expiration timestamps. - if err := storage.MVCCGarbageCollect( - ctx, readWriter, cArgs.Stats, keys, h.Timestamp, - ); err != nil { - return result.Result{}, err + for _, gcKeys := range [][]roachpb.GCRequest_GCKey{localKeys, globalKeys} { + if err := storage.MVCCGarbageCollect( + ctx, readWriter, cArgs.Stats, gcKeys, h.Timestamp, + ); err != nil { + return result.Result{}, err + } } // Optionally bump the GC threshold timestamp. diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 85ff57b40213..3faa96d60ff7 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -446,13 +446,13 @@ func TestStoreRangeSplitIntents(t *testing.T) { // Verify the transaction record is gone. start := storage.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(roachpb.RKeyMin)) end := storage.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(roachpb.RKeyMax)) - iter := store.Engine().NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: roachpb.KeyMax}) + iter := store.Engine().NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: end.Key}) defer iter.Close() for iter.SeekGE(start); ; iter.Next() { if ok, err := iter.Valid(); err != nil { t.Fatal(err) - } else if !ok || !iter.UnsafeKey().Less(end) { + } else if !ok { break } diff --git a/pkg/kv/kvserver/rditer/replica_data_iter.go b/pkg/kv/kvserver/rditer/replica_data_iter.go index cad6e546eb27..36f4b486db26 100644 --- a/pkg/kv/kvserver/rditer/replica_data_iter.go +++ b/pkg/kv/kvserver/rditer/replica_data_iter.go @@ -38,9 +38,14 @@ type KeyRange struct { // TODO(sumeer): merge with ReplicaEngineDataIterator. We can use an EngineIterator // for MVCC key ranges and convert from EngineKey to MVCCKey. type ReplicaMVCCDataIterator struct { + reader storage.Reader curIndex int ranges []KeyRange - it storage.MVCCIterator + // When it is non-nil, it represents the iterator for curIndex. + // A non-nil it is valid, else it is either done, or err != nil. + it storage.MVCCIterator + err error + reverse bool } // ReplicaEngineDataIterator is like ReplicaMVCCDataIterator, but iterates @@ -213,117 +218,128 @@ func MakeUserKeyRange(d *roachpb.RangeDescriptor) KeyRange { // NewReplicaMVCCDataIterator creates a ReplicaMVCCDataIterator for the given // replica. It iterates over the replicated key ranges excluding the lock // table key range. Separated locks are made to appear as interleaved. The -// iterator is initially positioned at the end of the last range. +// iterator can do one of reverse or forward iteration, based on whether +// seekEnd is true or false, respectively. With reverse iteration, it is +// initially positioned at the end of the last range, else it is initially +// positioned at the start of the first range. // -// TODO(sumeer): narrow this interface after changing the test function -// runGCOld(). +// The iterator requires the reader.ConsistentIterators is true, since it +// creates a different iterator for each replicated key range. This is because +// MVCCIterator only allows changing the upper-bound of an existing iterator, +// and not both upper and lower bound. func NewReplicaMVCCDataIterator( d *roachpb.RangeDescriptor, reader storage.Reader, seekEnd bool, ) *ReplicaMVCCDataIterator { - // TODO(sumeer): this is broken for separated intents since the upper bound - // is a global key, but the ranges include replicated range local keys. So - // it underlying iterator used by intentInterleavingIter can iterate up into - // the lock table which is not an MVCCKey. - it := reader.NewMVCCIterator( - storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: d.EndKey.AsRawKey()}) + if !reader.ConsistentIterators() { + panic("ReplicaMVCCDataIterator needs a Reader that provides ConsistentIterators") + } ri := &ReplicaMVCCDataIterator{ - ranges: MakeReplicatedKeyRangesExceptLockTable(d), - it: it, + reader: reader, + ranges: MakeReplicatedKeyRangesExceptLockTable(d), + reverse: seekEnd, } - if seekEnd { - ri.seekEnd() + if ri.reverse { + ri.curIndex = len(ri.ranges) - 1 } else { - ri.seekStart() + ri.curIndex = 0 } + ri.tryCloseAndCreateIter() return ri } -// seekStart seeks the iterator to the start of its data range. -func (ri *ReplicaMVCCDataIterator) seekStart() { - ri.curIndex = 0 - ri.it.SeekGE(ri.ranges[ri.curIndex].Start) - ri.advance() -} - -// seekEnd seeks the iterator to the end of its data range. -func (ri *ReplicaMVCCDataIterator) seekEnd() { - ri.curIndex = len(ri.ranges) - 1 - ri.it.SeekLT(ri.ranges[ri.curIndex].End) - ri.retreat() +func (ri *ReplicaMVCCDataIterator) tryCloseAndCreateIter() { + for { + if ri.it != nil { + ri.it.Close() + ri.it = nil + } + if ri.curIndex < 0 || ri.curIndex >= len(ri.ranges) { + return + } + ri.it = ri.reader.NewMVCCIterator( + storage.MVCCKeyAndIntentsIterKind, + storage.IterOptions{ + LowerBound: ri.ranges[ri.curIndex].Start.Key, + UpperBound: ri.ranges[ri.curIndex].End.Key, + }) + if ri.reverse { + ri.it.SeekLT(ri.ranges[ri.curIndex].End) + } else { + ri.it.SeekGE(ri.ranges[ri.curIndex].Start) + } + if valid, err := ri.it.Valid(); valid || err != nil { + ri.err = err + return + } + if ri.reverse { + ri.curIndex-- + } else { + ri.curIndex++ + } + } } // Close the underlying iterator. func (ri *ReplicaMVCCDataIterator) Close() { - ri.curIndex = len(ri.ranges) - ri.it.Close() + if ri.it != nil { + ri.it.Close() + ri.it = nil + } } // Next advances to the next key in the iteration. func (ri *ReplicaMVCCDataIterator) Next() { + if ri.reverse { + panic("Next called on reverse iterator") + } ri.it.Next() - ri.advance() -} - -// advance moves the iterator forward through the ranges until a valid -// key is found or the iteration is done and the iterator becomes -// invalid. -func (ri *ReplicaMVCCDataIterator) advance() { - for { - if ok, _ := ri.Valid(); ok && ri.it.UnsafeKey().Less(ri.ranges[ri.curIndex].End) { - return - } + valid, err := ri.it.Valid() + if err != nil { + ri.err = err + return + } + if !valid { ri.curIndex++ - if ri.curIndex < len(ri.ranges) { - ri.it.SeekGE(ri.ranges[ri.curIndex].Start) - } else { - return - } + ri.tryCloseAndCreateIter() } } // Prev advances the iterator one key backwards. func (ri *ReplicaMVCCDataIterator) Prev() { + if !ri.reverse { + panic("Prev called on forward iterator") + } ri.it.Prev() - ri.retreat() -} - -// retreat is the opposite of advance. -func (ri *ReplicaMVCCDataIterator) retreat() { - for { - if ok, _ := ri.Valid(); ok && ri.ranges[ri.curIndex].Start.Less(ri.it.UnsafeKey()) { - return - } + valid, err := ri.it.Valid() + if err != nil { + ri.err = err + return + } + if !valid { ri.curIndex-- - if ri.curIndex >= 0 { - ri.it.SeekLT(ri.ranges[ri.curIndex].End) - } else { - return - } + ri.tryCloseAndCreateIter() } } // Valid returns true if the iterator currently points to a valid value. func (ri *ReplicaMVCCDataIterator) Valid() (bool, error) { - ok, err := ri.it.Valid() - ok = ok && ri.curIndex >= 0 && ri.curIndex < len(ri.ranges) - return ok, err + if ri.err != nil { + return false, ri.err + } + if ri.it == nil { + return false, nil + } + return true, nil } // Key returns the current key. Only called in tests. func (ri *ReplicaMVCCDataIterator) Key() storage.MVCCKey { - key := ri.it.UnsafeKey() - keyCopy := make([]byte, len(key.Key)) - copy(keyCopy, key.Key) - key.Key = keyCopy - return key + return ri.it.Key() } // Value returns the current value. Only called in tests. func (ri *ReplicaMVCCDataIterator) Value() []byte { - value := ri.it.UnsafeValue() - valueCopy := make([]byte, len(value)) - copy(valueCopy, value) - return valueCopy + return ri.it.Value() } // UnsafeKey returns the same value as Key, but the memory is invalidated on diff --git a/pkg/kv/kvserver/rditer/replica_data_iter_test.go b/pkg/kv/kvserver/rditer/replica_data_iter_test.go index 84163a137bb3..5222753c014f 100644 --- a/pkg/kv/kvserver/rditer/replica_data_iter_test.go +++ b/pkg/kv/kvserver/rditer/replica_data_iter_test.go @@ -126,13 +126,12 @@ func createRangeData( } func verifyRDReplicatedOnlyMVCCIter( - t *testing.T, - desc *roachpb.RangeDescriptor, - readWriter storage.ReadWriter, - expectedKeys []storage.MVCCKey, + t *testing.T, desc *roachpb.RangeDescriptor, eng storage.Engine, expectedKeys []storage.MVCCKey, ) { t.Helper() verify := func(t *testing.T, useSpanSet, reverse bool) { + readWriter := eng.NewReadOnly() + defer readWriter.Close() if useSpanSet { var spans spanset.SpanSet spans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{ @@ -192,11 +191,10 @@ func verifyRDReplicatedOnlyMVCCIter( } func verifyRDEngineIter( - t *testing.T, - desc *roachpb.RangeDescriptor, - readWriter storage.ReadWriter, - expectedKeys []storage.MVCCKey, + t *testing.T, desc *roachpb.RangeDescriptor, eng storage.Engine, expectedKeys []storage.MVCCKey, ) { + readWriter := eng.NewReadOnly() + defer readWriter.Close() iter := NewReplicaEngineDataIterator(desc, readWriter, false) defer iter.Close() i := 0 diff --git a/pkg/kv/kvserver/rditer/stats.go b/pkg/kv/kvserver/rditer/stats.go index d00cd6cc22c2..324b1c21b905 100644 --- a/pkg/kv/kvserver/rditer/stats.go +++ b/pkg/kv/kvserver/rditer/stats.go @@ -22,16 +22,23 @@ import ( func ComputeStatsForRange( d *roachpb.RangeDescriptor, reader storage.Reader, nowNanos int64, ) (enginepb.MVCCStats, error) { - iter := reader.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: d.EndKey.AsRawKey()}) - defer iter.Close() - ms := enginepb.MVCCStats{} + var err error for _, keyRange := range MakeReplicatedKeyRangesExceptLockTable(d) { - msDelta, err := iter.ComputeStats(keyRange.Start.Key, keyRange.End.Key, nowNanos) + func() { + iter := reader.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, + storage.IterOptions{UpperBound: keyRange.End.Key}) + defer iter.Close() + + var msDelta enginepb.MVCCStats + if msDelta, err = iter.ComputeStats(keyRange.Start.Key, keyRange.End.Key, nowNanos); err != nil { + return + } + ms.Add(msDelta) + }() if err != nil { return enginepb.MVCCStats{}, err } - ms.Add(msDelta) } return ms, nil } diff --git a/pkg/kv/kvserver/replica_consistency.go b/pkg/kv/kvserver/replica_consistency.go index 6bb5c39d2d56..f97a5c8e3fd2 100644 --- a/pkg/kv/kvserver/replica_consistency.go +++ b/pkg/kv/kvserver/replica_consistency.go @@ -571,9 +571,6 @@ func (r *Replica) sha512( statsOnly := mode == roachpb.ChecksumMode_CHECK_STATS // Iterate over all the data in the range. - iter := snap.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: desc.EndKey.AsRawKey()}) - defer iter.Close() - var alloc bufalloc.ByteAllocator var intBuf [8]byte var legacyTimestamp hlc.LegacyTimestamp @@ -635,9 +632,12 @@ func (r *Replica) sha512( // we will probably not have any interleaved intents so we could stop // using MVCCKeyAndIntentsIterKind and consider all locks here. for _, span := range rditer.MakeReplicatedKeyRangesExceptLockTable(&desc) { + iter := snap.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, + storage.IterOptions{UpperBound: span.End.Key}) spanMS, err := storage.ComputeStatsForRange( iter, span.Start.Key, span.End.Key, 0 /* nowNanos */, visitor, ) + iter.Close() if err != nil { return nil, err } diff --git a/pkg/storage/batch_test.go b/pkg/storage/batch_test.go index 470949b4400c..55468cabd41a 100644 --- a/pkg/storage/batch_test.go +++ b/pkg/storage/batch_test.go @@ -110,7 +110,7 @@ func testBatchBasics(t *testing.T, writeOnly bool, commit func(e Engine, b Batch {Key: mvccKey("c"), Value: appender("foo")}, {Key: mvccKey("d"), Value: []byte("before")}, } - kvs, err := Scan(e, roachpb.KeyMin, roachpb.KeyMax, 0) + kvs, err := Scan(e, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -126,7 +126,7 @@ func testBatchBasics(t *testing.T, writeOnly bool, commit func(e Engine, b Batch } if !writeOnly { // Scan values from batch directly. - kvs, err = Scan(b, roachpb.KeyMin, roachpb.KeyMax, 0) + kvs, err = Scan(b, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -139,7 +139,7 @@ func testBatchBasics(t *testing.T, writeOnly bool, commit func(e Engine, b Batch if err := commit(e, b); err != nil { t.Fatal(err) } - kvs, err = Scan(e, roachpb.KeyMin, roachpb.KeyMax, 0) + kvs, err = Scan(e, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -273,7 +273,7 @@ func TestReadOnlyBasics(t *testing.T) { {Key: mvccKey("c"), Value: appender("foobar")}, } - kvs, err := Scan(e, roachpb.KeyMin, roachpb.KeyMax, 0) + kvs, err := Scan(e, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -723,7 +723,7 @@ func TestBatchScanWithDelete(t *testing.T) { if err := b.ClearUnversioned(mvccKey("a").Key); err != nil { t.Fatal(err) } - kvs, err := Scan(b, roachpb.KeyMin, roachpb.KeyMax, 0) + kvs, err := Scan(b, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -761,7 +761,7 @@ func TestBatchScanMaxWithDeleted(t *testing.T) { t.Fatal(err) } // A scan with max=1 should scan "b". - kvs, err := Scan(b, roachpb.KeyMin, roachpb.KeyMax, 1) + kvs, err := Scan(b, localMax, roachpb.KeyMax, 1) if err != nil { t.Fatal(err) } diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go index 2c69877df7fc..0cba52bf07fd 100644 --- a/pkg/storage/bench_test.go +++ b/pkg/storage/bench_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" @@ -416,7 +417,7 @@ func runMVCCScan(ctx context.Context, b *testing.B, emk engineMaker, opts benchS // timings more stable. Otherwise, the first run will be penalized pulling // data into the cache while later runs will not. iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) - _, _ = iter.ComputeStats(roachpb.KeyMin, roachpb.KeyMax, 0) + _, _ = iter.ComputeStats(keys.LocalMax, roachpb.KeyMax, 0) iter.Close() } @@ -800,7 +801,7 @@ func runMVCCDeleteRange(ctx context.Context, b *testing.B, emk engineMaker, valu ctx, eng, &enginepb.MVCCStats{}, - roachpb.KeyMin, + keys.LocalMax, roachpb.KeyMax, math.MaxInt64, hlc.MaxTimestamp, @@ -836,9 +837,12 @@ func runClearRange( // // TODO(benesch): when those hacks are removed, don't bother computing the // first key and simply ClearRange(NilKey, MVCCKeyMax). + // + // TODO(sumeer): we are now seeking starting at LocalMax, so the + // aforementioned issue is probably resolved. Clean this up. iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) defer iter.Close() - iter.SeekGE(NilKey) + iter.SeekGE(MVCCKey{Key: keys.LocalMax}) if ok, err := iter.Valid(); !ok { b.Fatalf("unable to find first key (err: %v)", err) } @@ -880,7 +884,7 @@ func runMVCCComputeStats(ctx context.Context, b *testing.B, emk engineMaker, val var err error for i := 0; i < b.N; i++ { iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) - stats, err = iter.ComputeStats(roachpb.KeyMin, roachpb.KeyMax, 0) + stats, err = iter.ComputeStats(keys.LocalMax, roachpb.KeyMax, 0) iter.Close() if err != nil { b.Fatal(err) @@ -1077,7 +1081,7 @@ func runExportToSst( for i := 0; i < b.N; i++ { startTS := hlc.Timestamp{WallTime: int64(numRevisions / 2)} endTS := hlc.Timestamp{WallTime: int64(numRevisions + 2)} - _, _, _, err := engine.ExportMVCCToSst(roachpb.KeyMin, roachpb.KeyMax, startTS, endTS, + _, _, _, err := engine.ExportMVCCToSst(keys.LocalMax, roachpb.KeyMax, startTS, endTS, exportAllRevisions, 0 /* targetSize */, 0 /* maxSize */, useTBI) if err != nil { b.Fatal(err) diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 1e00818466a9..fda1599b08db 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -147,8 +147,25 @@ type MVCCIterator interface { // and the encoded SST data specified, within the provided key range. Returns // stats on skipped KVs, or an error if a collision is found. CheckForKeyCollisions(sstData []byte, start, end roachpb.Key) (enginepb.MVCCStats, error) - // SetUpperBound installs a new upper bound for this iterator. The caller can modify - // the parameter after this function returns. + // SetUpperBound installs a new upper bound for this iterator. The caller + // can modify the parameter after this function returns. This must not be a + // nil key. When Reader.ConsistentIterators is true, prefer creating a new + // iterator. + // + // Due to the rare use, we are limiting this method to not switch an + // iterator from a global key upper-bound to a local key upper-bound (it + // simplifies some code in intentInterleavingIter) or vice versa. Iterator + // reuse already happens under-the-covers for most Reader implementations + // when constructing a new iterator, and that is a much cleaner solution. + // + // TODO(sumeer): this method is rarely used and is a source of complexity + // since intentInterleavingIter needs to fiddle with the bounds of its + // underlying iterators when this is called. Currently only used by + // pebbleBatch.ClearIterRange to modify the upper bound of the iterator it + // is given: this use is unprincipled and there is a comment in that code + // about it. The caller is already usually setting the bounds accurately, + // and in some cases the callee is tightening the upper bound. Remove that + // use case and remove this from the interface. SetUpperBound(roachpb.Key) // Stats returns statistics about the iterator. Stats() IteratorStats @@ -200,7 +217,9 @@ type EngineIterator interface { // Value returns the current value as a byte slice. // REQUIRES: latest positioning function returned valid=true. Value() []byte - // SetUpperBound installs a new upper bound for this iterator. + // SetUpperBound installs a new upper bound for this iterator. When + // Reader.ConsistentIterators is true, prefer creating a new iterator. + // TODO(sumeer): remove this method. SetUpperBound(roachpb.Key) // GetRawIter is a low-level method only for use in the storage package, // that returns the underlying pebble Iterator. @@ -256,6 +275,23 @@ type MVCCIterKind int const ( // MVCCKeyAndIntentsIterKind specifies that intents must be seen, and appear // interleaved with keys, even if they are in a separated lock table. + // Iterators of this kind are not allowed to span from local to global keys, + // since the physical layout has the separated lock table in-between the + // local and global keys. These iterators do strict error checking and panic + // if the caller seems that to be trying to violate this constraint. + // Specifically: + // - If both bounds are set they must not span from local to global. + // - Any bound (lower or upper), constrains the iterator for its lifetime to + // one of local or global keys. The iterator will not tolerate a seek or + // SetUpperBound call that violates this constraint. + // We could, with significant code complexity, not constrain an iterator for + // its lifetime, and allow a seek that specifies a global (local) key to + // change the constraint to global (local). This would allow reuse of the + // same iterator with a large global upper-bound. But a Next call on the + // highest local key (Prev on the lowest global key) would still not be able + // to transparently skip over the intermediate lock table. We deem that + // behavior to be more surprising and bug-prone (for the caller), than being + // strict. MVCCKeyAndIntentsIterKind MVCCIterKind = iota // MVCCKeyIterKind specifies that the caller does not need to see intents. // Any interleaved intents may be seen, but no correctness properties are @@ -271,7 +307,9 @@ const ( // different iterators created by NewMVCCIterator, NewEngineIterator: // - pebbleSnapshot, because it uses an engine snapshot. // - pebbleReadOnly, pebbleBatch: when the IterOptions do not specify a -// timestamp hint. +// timestamp hint. Note that currently the engine state visible here is +// not as of the time of the Reader creation. It is the time when the +// first iterator is created. // The ConsistentIterators method returns true when this consistency is // guaranteed by the Reader. // TODO(sumeer): this partial consistency can be a source of bugs if future @@ -597,18 +635,9 @@ type Engine interface { // operations) are executed on it and caches iterators to avoid the overhead // of creating multiple iterators for batched reads. // - // All iterators created from a read-only engine with the same "Prefix" - // option are guaranteed to provide a consistent snapshot of the underlying - // engine. For instance, two prefix iterators created from a read-only - // engine will provide a consistent snapshot. Similarly, two non-prefix - // iterators created from a read-only engine will provide a consistent - // snapshot. However, a prefix iterator and a non-prefix iterator created - // from a read-only engine are not guaranteed to provide a consistent view - // of the underlying engine. - // - // TODO(nvanbenschoten): remove this complexity when we're fully on Pebble - // and can guarantee that all iterators created from a read-only engine are - // consistent. To do this, we will want to add an MVCCIterator.Clone method. + // All iterators created from a read-only engine are guaranteed to provide a + // consistent snapshot of the underlying engine. See the comment on the + // Reader interface and the Reader.ConsistentIterators method. NewReadOnly() ReadWriter // NewUnindexedBatch returns a new instance of a batched engine which wraps // this engine. It is unindexed, in that writes to the batch are not diff --git a/pkg/storage/engine_test.go b/pkg/storage/engine_test.go index 4a7bbcc5f19c..b06a266b581f 100644 --- a/pkg/storage/engine_test.go +++ b/pkg/storage/engine_test.go @@ -738,7 +738,7 @@ func TestEngineTimeBound(t *testing.T) { // time bounded iterator instead. iter := batch.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) defer iter.Close() - iter.SeekGE(NilKey) + iter.SeekGE(MVCCKey{Key: keys.LocalMax}) var count int for ; ; iter.Next() { @@ -853,10 +853,9 @@ func TestEngineScan1(t *testing.T) { } ensureRangeEqual(t, sortedKeys[1:3], keyMap, keyvals) - // Should return all key/value pairs in lexicographic order. Note that "" - // is the lowest key possible and is a special case in engine.scan, that's - // why we test it here. - startKeys := []roachpb.Key{roachpb.Key("cat"), roachpb.Key("")} + // Should return all key/value pairs in lexicographic order. Note that + // LocalMax is the lowest possible global key. + startKeys := []roachpb.Key{roachpb.Key("cat"), keys.LocalMax} for _, startKey := range startKeys { keyvals, err = Scan(engine, startKey, roachpb.KeyMax, 0) if err != nil { @@ -907,7 +906,7 @@ func TestEngineScan2(t *testing.T) { insertKeys(keys, engine, t) // Scan all keys (non-inclusive of final key). - verifyScan(roachpb.KeyMin, roachpb.KeyMax, 10, keys[:5], engine, t) + verifyScan(localMax, roachpb.KeyMax, 10, keys[:5], engine, t) verifyScan(roachpb.Key("a"), roachpb.KeyMax, 10, keys[:5], engine, t) // Scan sub range. @@ -915,11 +914,11 @@ func TestEngineScan2(t *testing.T) { verifyScan(roachpb.Key("aa0"), roachpb.Key("abcc"), 10, keys[2:5], engine, t) // Scan with max values. - verifyScan(roachpb.KeyMin, roachpb.KeyMax, 3, keys[:3], engine, t) + verifyScan(localMax, roachpb.KeyMax, 3, keys[:3], engine, t) verifyScan(roachpb.Key("a0"), roachpb.KeyMax, 3, keys[1:4], engine, t) // Scan with max value 0 gets all values. - verifyScan(roachpb.KeyMin, roachpb.KeyMax, 0, keys[:5], engine, t) + verifyScan(localMax, roachpb.KeyMax, 0, keys[:5], engine, t) }) } } @@ -942,14 +941,14 @@ func testEngineDeleteRange(t *testing.T, clearRange func(engine Engine, start, e insertKeys(keys, engine, t) // Scan all keys (non-inclusive of final key). - verifyScan(roachpb.KeyMin, roachpb.KeyMax, 10, keys[:5], engine, t) + verifyScan(localMax, roachpb.KeyMax, 10, keys[:5], engine, t) // Delete a range of keys if err := clearRange(engine, mvccKey("aa"), mvccKey("abc")); err != nil { t.Fatal(err) } // Verify what's left - verifyScan(roachpb.KeyMin, roachpb.KeyMax, 10, + verifyScan(localMax, roachpb.KeyMax, 10, []MVCCKey{mvccKey("a"), mvccKey("abc")}, engine, t) }) } @@ -1088,8 +1087,8 @@ func TestSnapshotMethods(t *testing.T) { } // Verify Scan. - keyvals, _ := Scan(engine, roachpb.KeyMin, roachpb.KeyMax, 0) - keyvalsSnapshot, err := Scan(snap, roachpb.KeyMin, roachpb.KeyMax, 0) + keyvals, _ := Scan(engine, localMax, roachpb.KeyMax, 0) + keyvalsSnapshot, err := Scan(snap, localMax, roachpb.KeyMax, 0) if err != nil { t.Fatal(err) } @@ -1100,7 +1099,7 @@ func TestSnapshotMethods(t *testing.T) { // Verify MVCCIterate. index := 0 - if err := snap.MVCCIterate(roachpb.KeyMin, roachpb.KeyMax, MVCCKeyAndIntentsIterKind, func(kv MVCCKeyValue) error { + if err := snap.MVCCIterate(localMax, roachpb.KeyMax, MVCCKeyAndIntentsIterKind, func(kv MVCCKeyValue) error { if !kv.Key.Equal(keys[index]) || !bytes.Equal(kv.Value, vals[index]) { t.Errorf("%d: key/value not equal between expected and snapshot: %s/%s, %s/%s", index, keys[index], vals[index], kv.Key, kv.Value) diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index 78b1d9bbe532..ca32c287e181 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -23,6 +23,14 @@ import ( "github.com/cockroachdb/errors" ) +type intentInterleavingIterConstraint int8 + +const ( + notConstrained intentInterleavingIterConstraint = iota + constrainedToLocal + constrainedToGlobal +) + // intentInterleavingIter makes separated intents appear as interleaved. It // relies on the following assumptions: // - There can also be intents that are physically interleaved. @@ -53,31 +61,25 @@ import ( // // The implementation of intentInterleavingIter assumes callers iterating // forward (reverse) are setting an upper (lower) bound. There is protection -// for misbehavior by the callers for the lock table iterator, which prevents -// that iterator from leaving the lock table key space, by adding additional -// bounds. But we do not manufacture bounds to prevent MVCCIterator to iterate -// into the lock table key space, for the following reasons: -// - Adding a bound where the caller has not specified one adds a key -// comparison when iterating. We don't expect much iteration (next/prev -// calls) over the lock table iterator, because of the rarity of intents, -// but that is not the case for the MVCC key space. -// - The MVCC key space is split into two spans: local keys preceding the lock -// table key space, and global keys. Adding a bound where one is not -// specified by the caller requires us to know which one the caller wants to -// iterate over -- the bounds may not fully specify the intent of the caller -// e.g. a caller could use ["", \xFF\xFF) as the bounds, and use the seek -// key to narrow down iteration over the local key space or the global key -// space. -// Instead, pebbleIterator, the base implementation of MVCCIterator, cheaply -// checks whether it has iterated into the lock table key space, and if so, -// marks itself as exhausted (Valid() returns false, nil). -// -// A limitation of these MVCCIterator implementations, that follows from the -// fact that the MVCC key space is split into two spans, is that one can't -// typically iterate using next/prev from the local MVCC key space to the -// global one and vice versa. One needs to seek in-between. +// for misbehavior by the callers that don't set such bounds, by manufacturing +// bounds. These manufactured bounds prevent the lock table iterator from +// leaving the lock table key space. We also need to manufacture bounds for +// the MVCCIterator to prevent it from iterating into the lock table. Note +// that any manufactured bounds for both the lock table iterator and +// MVCCIterator must be consistent since the intentInterleavingIter does not +// like to see a lock with no corresponding provisional value (it will +// consider than an error). Manufacturing of bounds is complicated by the fact +// that the MVCC key space is split into two spans: local keys preceding the +// lock table key space, and global keys. To manufacture a bound, we need to +// know whether the caller plans to iterate over local or global keys. Setting +// aside prefix iteration, which doesn't need any of these manufactured +// bounds, the call to newIntentInterleavingIter must have specified at least +// one of the lower or upper bound. We use that to "constrain" the iterator as +// either a local key iterator or global key iterator and panic if a caller +// violates that in a subsequent SeekGE/SeekLT/SetUpperBound call. type intentInterleavingIter struct { - prefix bool + prefix bool + constraint intentInterleavingIterConstraint // iter is for iterating over MVCC keys and interleaved intents. iter MVCCIterator @@ -118,8 +120,6 @@ type intentInterleavingIter struct { valid bool err error - hasUpperBound bool - intentKeyBuf []byte } @@ -139,42 +139,75 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator if !opts.MinTimestampHint.IsEmpty() || !opts.MaxTimestampHint.IsEmpty() { panic("intentInterleavingIter must not be used with timestamp hints") } - if opts.LowerBound != nil && opts.UpperBound != nil { - lowerIsLocal := isLocal(opts.LowerBound) - upperIsLocal := isLocal(opts.UpperBound) || bytes.Equal(opts.UpperBound, keys.LocalMax) - if lowerIsLocal != upperIsLocal { + var lowerIsLocal, upperIsLocal bool + var constraint intentInterleavingIterConstraint + if opts.LowerBound != nil { + lowerIsLocal = isLocal(opts.LowerBound) + if lowerIsLocal { + constraint = constrainedToLocal + } else { + constraint = constrainedToGlobal + } + } + if opts.UpperBound != nil { + upperIsLocal = isLocal(opts.UpperBound) || bytes.Equal(opts.UpperBound, keys.LocalMax) + if opts.LowerBound != nil && lowerIsLocal != upperIsLocal { panic(fmt.Sprintf( "intentInterleavingIter cannot span from lowerIsLocal %t, %s to upperIsLocal %t, %s", lowerIsLocal, opts.LowerBound.String(), upperIsLocal, opts.UpperBound.String())) } + if upperIsLocal { + constraint = constrainedToLocal + } else { + constraint = constrainedToGlobal + } + } + if !opts.Prefix { + if opts.LowerBound == nil && opts.UpperBound == nil { + // This is the same requirement as pebbleIterator. + panic("iterator must set prefix or upper bound or lower bound") + } + // At least one bound is specified, so constraint != notConstrained. But + // may need to manufacture a bound for the currently unbounded side. + if opts.LowerBound == nil && constraint == constrainedToGlobal { + // Iterating over global keys, and need a lower-bound, to prevent the MVCCIterator + // from iterating into the lock table. + opts.LowerBound = keys.LocalMax + } + if opts.UpperBound == nil && constraint == constrainedToLocal { + // Iterating over local keys, and need an upper-bound, to prevent the MVCCIterator + // from iterating into the lock table. + opts.UpperBound = keys.LocalRangeLockTablePrefix + } } + // Else prefix iteration, so do not need to manufacture bounds for both + // iterators since the pebble.Iterator implementation will hide the keys + // that do not match the prefix. Note that this is not equivalent to + // constraint==notConstrained -- it is acceptable for a caller to specify a + // bound for prefix iteration, though since they don't need to, most callers + // don't. + intentOpts := opts var intentKeyBuf []byte if opts.LowerBound != nil { intentOpts.LowerBound, intentKeyBuf = keys.LockTableSingleKey(opts.LowerBound, nil) - } else { - // Sometimes callers iterate backwards without having a lower bound. - // Make sure we don't step outside the lock table key space. + } else if !opts.Prefix { + // Make sure we don't step outside the lock table key space. Note that + // this is the case where the lower bound was not set and + // constrainedToLocal. intentOpts.LowerBound = keys.LockTableSingleKeyStart } if opts.UpperBound != nil { intentOpts.UpperBound, _ = keys.LockTableSingleKey(opts.UpperBound, nil) - } else { - // Sometimes callers iterate forwards without having an upper bound. - // Make sure we don't step outside the lock table key space. + } else if !opts.Prefix { + // Make sure we don't step outside the lock table key space. Note that + // this is the case where the upper bound was not set and + // constrainedToGlobal. intentOpts.UpperBound = keys.LockTableSingleKeyEnd } // Note that we can reuse intentKeyBuf after NewEngineIterator returns. intentIter := reader.NewEngineIterator(intentOpts) - // We assume that callers iterating forward will set an upper bound, - // and callers iterating in reverse will set a lower bound, which - // will prevent them from accidentally iterating into the lock-table - // key space. The MVCCIterator implementations require one of the bounds - // or prefix iteration. We remember whether the upper bound has been - // set, so if not set, we can set the upper bound when SeekGE is called - // for prefix iteration. - // // The creation of these iterators can race with concurrent mutations, which // may make them inconsistent with each other. So we clone here, to ensure // consistency (certain Reader implementations already ensure consistency, @@ -187,11 +220,11 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator } iiIter := intentInterleavingIterPool.Get().(*intentInterleavingIter) *iiIter = intentInterleavingIter{ - prefix: opts.Prefix, - iter: iter, - intentIter: intentIter, - hasUpperBound: opts.UpperBound != nil, - intentKeyBuf: intentKeyBuf, + prefix: opts.Prefix, + constraint: constraint, + iter: iter, + intentIter: intentIter, + intentKeyBuf: intentKeyBuf, } return iiIter } @@ -201,15 +234,8 @@ func (i *intentInterleavingIter) SeekGE(key MVCCKey) { i.valid = true i.err = nil - if i.prefix { - // Caller will use a mix of SeekGE and Next. If key is before the lock table key - // space, make sure there is an upper bound, if not explicitly set at creation time - // or using SetUpperBound. We do not set hasUpperBound to true since this is - // an implicit (not set by the user) upper-bound, and we want to change it on - // a subsequent call to SeekGE. - if !i.hasUpperBound && keys.IsLocal(key.Key) && !keys.IsLocalStoreKey(key.Key) { - i.iter.SetUpperBound(keys.LocalRangeLockTablePrefix) - } + if i.constraint != notConstrained { + i.checkConstraint(key.Key, false) } var intentSeekKey roachpb.Key if key.Timestamp.IsEmpty() { @@ -239,6 +265,24 @@ func (i *intentInterleavingIter) SeekGE(key MVCCKey) { i.computePos() } +func (i *intentInterleavingIter) checkConstraint(k roachpb.Key, isExclusiveUpper bool) { + kConstraint := constrainedToGlobal + if isLocal(k) { + if bytes.Compare(k, keys.LocalRangeLockTablePrefix) > 0 { + panic(fmt.Sprintf("intentInterleavingIter cannot be used with invalid local keys %s", + k.String())) + } + kConstraint = constrainedToLocal + } else if isExclusiveUpper && bytes.Equal(k, keys.LocalMax) { + kConstraint = constrainedToLocal + } + if kConstraint != i.constraint { + panic(fmt.Sprintf( + "iterator with constraint=%d is being used with key %s that has constraint=%d", + i.constraint, k.String(), kConstraint)) + } +} + func (i *intentInterleavingIter) computePos() { var err error i.iterValid, err = i.iter.Valid() @@ -546,19 +590,31 @@ func (i *intentInterleavingIter) SeekLT(key MVCCKey) { i.valid = true i.err = nil + if i.prefix { + i.err = errors.Errorf("prefix iteration is not permitted with SeekLT") + i.valid = false + return + } + if i.constraint != notConstrained { + i.checkConstraint(key.Key, true) + if i.constraint == constrainedToLocal && bytes.Equal(key.Key, keys.LocalMax) { + // Move it down to below the lock table so can iterate down cleanly into + // the local key space. Note that we disallow anyone using a seek key + // that is a local key above the lock table, and there should no keys in + // the engine there either (at least not keys that we need to see using + // an MVCCIterator). + key.Key = keys.LocalRangeLockTablePrefix + } + } + var intentSeekKey roachpb.Key if key.Timestamp.IsEmpty() { // Common case. intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key, i.intentKeyBuf) } else { - // Seeking to a specific version, so need to see the intent. - if i.prefix { - i.err = errors.Errorf("prefix iteration is not permitted with SeekLT") - i.valid = false - return - } - // Since we need to see the intent for key.Key, and we don't have SeekLE, call - // Next() on the key before doing SeekLT. + // Seeking to a specific version, so need to see the intent. Since we need + // to see the intent for key.Key, and we don't have SeekLE, call Next() on + // the key before doing SeekLT. intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key.Next(), i.intentKeyBuf) } valid, err := i.intentIter.SeekEngineKeyLT(EngineKey{Key: intentSeekKey}) @@ -752,17 +808,13 @@ func (i *intentInterleavingIter) CheckForKeyCollisions( func (i *intentInterleavingIter) SetUpperBound(key roachpb.Key) { i.iter.SetUpperBound(key) - var intentUpperBound roachpb.Key - if key != nil { - intentUpperBound, i.intentKeyBuf = keys.LockTableSingleKey(key, i.intentKeyBuf) - // Note that we can reuse intentKeyBuf after SetUpperBound returns. - } else { - // Sometimes callers iterate forwards without having an upper bound. - // Make sure we don't step outside the lock table key space. - intentUpperBound = keys.LockTableSingleKeyEnd + // Preceding call to SetUpperBound has confirmed that key != nil. + if i.constraint != notConstrained { + i.checkConstraint(key, true) } + var intentUpperBound roachpb.Key + intentUpperBound, i.intentKeyBuf = keys.LockTableSingleKey(key, i.intentKeyBuf) i.intentIter.SetUpperBound(intentUpperBound) - i.hasUpperBound = key != nil } func (i *intentInterleavingIter) Stats() IteratorStats { diff --git a/pkg/storage/intent_interleaving_iter_test.go b/pkg/storage/intent_interleaving_iter_test.go index fc268a78bba0..b5caafea0c77 100644 --- a/pkg/storage/intent_interleaving_iter_test.go +++ b/pkg/storage/intent_interleaving_iter_test.go @@ -39,6 +39,15 @@ func scanRoachKey(t *testing.T, td *datadriven.TestData, field string) roachpb.K rk := roachpb.Key(k) if strings.HasPrefix(k, "L") { rk = append(keys.LocalRangePrefix, rk[1:]...) + } else if strings.HasPrefix(k, "S") { + rk = append(keys.LocalStorePrefix, rk[1:]...) + } else if strings.HasPrefix(k, "Y") { + rk = append(keys.LocalRangeLockTablePrefix.PrefixEnd(), rk[1:]...) + } else if strings.HasPrefix(k, "Z") { + if len(rk) != 1 { + panic("Z represents LocalMax and should not have more than one character") + } + rk = keys.LocalMax } return bytes.ReplaceAll(rk, []byte("\\0"), []byte{0}) } @@ -46,6 +55,12 @@ func scanRoachKey(t *testing.T, td *datadriven.TestData, field string) roachpb.K func makePrintableKey(k MVCCKey) MVCCKey { if bytes.HasPrefix(k.Key, keys.LocalRangePrefix) { k.Key = append([]byte("L"), k.Key[len(keys.LocalRangePrefix):]...) + } else if bytes.HasPrefix(k.Key, keys.LocalStorePrefix) { + k.Key = append([]byte("S"), k.Key[len(keys.LocalStorePrefix):]...) + } else if bytes.HasPrefix(k.Key, keys.LocalRangeLockTablePrefix.PrefixEnd()) { + k.Key = append([]byte("Y"), k.Key[len(keys.LocalRangeLockTablePrefix):]...) + } else if bytes.Equal(k.Key, keys.LocalMax) { + k.Key = []byte("Z") } k.Key = bytes.ReplaceAll(k.Key, []byte{0}, []byte("\\0")) return k @@ -166,7 +181,12 @@ func checkAndOutputIter(iter MVCCIterator, b *strings.Builder) { // followed by newline separated sequence of operations: // next, prev, seek-lt, seek-ge, set-upper, next-key // -// A key starting with L is interpreted as a local-range key. +// Keys are interpreted as: +// - starting with L is interpreted as a local-range key. +// - starting with S is interpreted as a store local key. +// - starting with Y is interpreted as a local key starting immediately after +// the lock table key space. This is for testing edge cases wrt bounds. +// - a single Z is interpreted as LocalMax func TestIntentInterleavingIter(t *testing.T) { defer leaktest.AfterTest(t)() @@ -350,6 +370,86 @@ func TestIntentInterleavingIter(t *testing.T) { }) } +func TestIntentInterleavingIterBoundaries(t *testing.T) { + defer leaktest.AfterTest(t)() + + eng := createTestPebbleEngine() + defer eng.Close() + // Boundary cases for constrainedToLocal. + func() { + opts := IterOptions{LowerBound: keys.MinKey} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.SetUpperBound(keys.LocalMax) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.SeekLT(MVCCKey{Key: keys.LocalMax}) + iter.Close() + }() + func() { + opts := IterOptions{UpperBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.SetUpperBound(keys.LocalMax) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.Close() + }() + require.Panics(t, func() { + opts := IterOptions{UpperBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + iter.SeekLT(MVCCKey{Key: keys.MaxKey}) + }) + // Boundary cases for constrainedToGlobal + func() { + opts := IterOptions{LowerBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToGlobal, iter.constraint) + iter.Close() + }() + require.Panics(t, func() { + opts := IterOptions{LowerBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToGlobal, iter.constraint) + iter.SetUpperBound(keys.LocalMax) + iter.Close() + }) + require.Panics(t, func() { + opts := IterOptions{LowerBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToGlobal, iter.constraint) + iter.SeekLT(MVCCKey{Key: keys.LocalMax}) + iter.Close() + }) + // Panics for using a local key that is above the lock table. + require.Panics(t, func() { + opts := IterOptions{UpperBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.SeekLT(MVCCKey{Key: keys.LocalRangeLockTablePrefix.PrefixEnd()}) + iter.Close() + }) + require.Panics(t, func() { + opts := IterOptions{UpperBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToLocal, iter.constraint) + iter.SeekGE(MVCCKey{Key: keys.LocalRangeLockTablePrefix.PrefixEnd()}) + iter.Close() + }) + // Prefix iteration does not affect the constraint if bounds are + // specified. + func() { + opts := IterOptions{Prefix: true, LowerBound: keys.LocalMax} + iter := newIntentInterleavingIterator(eng, opts).(*intentInterleavingIter) + require.Equal(t, constrainedToGlobal, iter.constraint) + iter.Close() + }() + // Prefix iteration with no bounds. + func() { + iter := newIntentInterleavingIterator(eng, IterOptions{Prefix: true}).(*intentInterleavingIter) + require.Equal(t, notConstrained, iter.constraint) + iter.Close() + }() +} + type lockKeyValue struct { key LockTableKey val []byte @@ -535,6 +635,7 @@ func doOps(t *testing.T, ops []string, eng Engine, interleave bool, out *strings var seedFlag = flag.Int64("seed", -1, "specify seed to use for random number generator") +// TODO(sumeer): generate a mix of local and global keys. func TestRandomizedIntentInterleavingIter(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/storage/multi_iterator_test.go b/pkg/storage/multi_iterator_test.go index a9634a7ed09d..131d6091d1a1 100644 --- a/pkg/storage/multi_iterator_test.go +++ b/pkg/storage/multi_iterator_test.go @@ -127,7 +127,7 @@ func TestMultiIterator(t *testing.T) { t.Run(subtest.name, func(t *testing.T) { var output bytes.Buffer it := MakeMultiIterator(iters) - for it.SeekGE(MVCCKey{Key: keys.MinKey}); ; subtest.fn(it) { + for it.SeekGE(MVCCKey{Key: keys.LocalMax}); ; subtest.fn(it) { ok, err := it.Valid() if err != nil { t.Fatalf("unexpected error: %+v", err) diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index b96cb24b8bae..e294dccc8198 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -3016,10 +3016,18 @@ func GetBufUsingIter(iter MVCCIterator) IterAndBuf { } } +// SwitchIter switches to iter, and relinquishes ownership of the existing +// iter. +func (b *IterAndBuf) SwitchIter(iter MVCCIterator) { + b.iter = iter +} + // Cleanup must be called to release the resources when done. func (b IterAndBuf) Cleanup() { b.buf.release() - b.iter.Close() + if b.iter != nil { + b.iter.Close() + } } // MVCCResolveWriteIntentRange commits or aborts (rolls back) the @@ -3115,6 +3123,11 @@ func MVCCResolveWriteIntentRangeUsingIter( // timestamp parameter is used to compute the intent age on GC. // // Note that this method will be sorting the keys. +// +// REQUIRES: the keys are either all local keys, or all global keys, and +// not a mix of the two. This is to accommodate the implementation below +// that creates an iterator with bounds that span from the first to last +// key (in sorted order). func MVCCGarbageCollect( ctx context.Context, rw ReadWriter, @@ -3147,6 +3160,7 @@ func MVCCGarbageCollect( // TODO(sumeer): this can span from local to global keys. Fix in cmd_gc.go // which is already examining all the keys and can separate out the local // keys to call MVCCGarbageCollect separately. + // TODO(sumeer): do we even need these coarse bounds? iter := rw.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ LowerBound: keys[0].Key, UpperBound: keys[len(keys)-1].Key.Next(), diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index fd8125e544af..b9410520f5c4 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -96,7 +96,12 @@ func TestMVCCHistories(t *testing.T) { if strings.Contains(path, "_disallow_separated") && !DisallowSeparatedIntents { return } - if strings.Contains(path, "_allow_separated") && DisallowSeparatedIntents { + if strings.Contains(path, "_allow_separated") && + (DisallowSeparatedIntents || enabledSeparatedIntents) { + return + } + if strings.Contains(path, "_enable_separated") && + (DisallowSeparatedIntents || !enabledSeparatedIntents) { return } // We start from a clean slate in every test file. diff --git a/pkg/storage/mvcc_incremental_iterator_test.go b/pkg/storage/mvcc_incremental_iterator_test.go index 82b742bfac74..2dcdecec4b90 100644 --- a/pkg/storage/mvcc_incremental_iterator_test.go +++ b/pkg/storage/mvcc_incremental_iterator_test.go @@ -265,7 +265,6 @@ func TestMVCCIncrementalIteratorNextIgnoringTime(t *testing.T) { ctx := context.Background() var ( - localMax = keys.LocalMax keyMax = roachpb.KeyMax testKey1 = roachpb.Key("/db1") testKey2 = roachpb.Key("/db2") @@ -406,7 +405,6 @@ func TestMVCCIncrementalIterator(t *testing.T) { ctx := context.Background() var ( - localMax = keys.LocalMax keyMax = roachpb.KeyMax testKey1 = roachpb.Key("/db1") testKey2 = roachpb.Key("/db2") @@ -987,7 +985,7 @@ func TestMVCCIterateTimeBound(t *testing.T) { var expectedKVs []MVCCKeyValue iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) defer iter.Close() - iter.SeekGE(MVCCKey{}) + iter.SeekGE(MVCCKey{Key: localMax}) for { ok, err := iter.Valid() if err != nil { diff --git a/pkg/storage/mvcc_logical_ops.go b/pkg/storage/mvcc_logical_ops.go index 01c218d75f60..cc198509c243 100644 --- a/pkg/storage/mvcc_logical_ops.go +++ b/pkg/storage/mvcc_logical_ops.go @@ -11,6 +11,7 @@ package storage import ( + "bytes" "fmt" "github.com/cockroachdb/cockroach/pkg/keys" @@ -63,8 +64,6 @@ type MVCCLogicalOpDetails struct { // OpLoggerBatch records a log of logical MVCC operations. type OpLoggerBatch struct { Batch - distinct distinctOpLoggerBatch - distinctOpen bool ops []enginepb.MVCCLogicalOp opsAlloc bufalloc.ByteAllocator @@ -74,7 +73,6 @@ type OpLoggerBatch struct { // wraps the provided batch. func NewOpLoggerBatch(b Batch) *OpLoggerBatch { ol := &OpLoggerBatch{Batch: b} - ol.distinct.parent = ol return ol } @@ -82,9 +80,6 @@ var _ Batch = &OpLoggerBatch{} // LogLogicalOp implements the Writer interface. func (ol *OpLoggerBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { - if ol.distinctOpen { - panic("distinct batch already open") - } ol.logLogicalOp(op, details) ol.Batch.LogLogicalOp(op, details) } @@ -92,6 +87,9 @@ func (ol *OpLoggerBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalO func (ol *OpLoggerBatch) logLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { if keys.IsLocal(details.Key) { // Ignore mvcc operations on local keys. + if bytes.HasPrefix(details.Key, keys.LocalRangeLockTablePrefix) { + panic(fmt.Sprintf("seeing locktable key %s", details.Key.String())) + } return } @@ -153,23 +151,3 @@ func (ol *OpLoggerBatch) LogicalOps() []enginepb.MVCCLogicalOp { } return ol.ops } - -type distinctOpLoggerBatch struct { - ReadWriter - parent *OpLoggerBatch -} - -// LogLogicalOp implements the Writer interface. -func (dlw *distinctOpLoggerBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { - dlw.parent.logLogicalOp(op, details) - dlw.ReadWriter.LogLogicalOp(op, details) -} - -// Close implements the Reader interface. -func (dlw *distinctOpLoggerBatch) Close() { - if !dlw.parent.distinctOpen { - panic("distinct batch not open") - } - dlw.parent.distinctOpen = false - dlw.ReadWriter.Close() -} diff --git a/pkg/storage/mvcc_stats_test.go b/pkg/storage/mvcc_stats_test.go index f64f82971d88..a5cc35eb370e 100644 --- a/pkg/storage/mvcc_stats_test.go +++ b/pkg/storage/mvcc_stats_test.go @@ -51,15 +51,17 @@ func assertEqImpl( t.Errorf("%s: diff(ms, expMS) nontrivial", debug) } - it := rw.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax}) - defer it.Close() keyMin := roachpb.KeyMin + keyMax := keys.LocalMax if globalKeys { keyMin = keys.LocalMax + keyMax = roachpb.KeyMax } + it := rw.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: keyMax}) + defer it.Close() for _, mvccStatsTest := range mvccStatsTests { - compMS, err := mvccStatsTest.fn(it, keyMin, roachpb.KeyMax, ms.LastUpdateNanos) + compMS, err := mvccStatsTest.fn(it, keyMin, keyMax, ms.LastUpdateNanos) if err != nil { t.Fatal(err) } @@ -1415,7 +1417,13 @@ func (s *state) intent(status roachpb.TransactionStatus) roachpb.LockUpdate { } func (s *state) intentRange(status roachpb.TransactionStatus) roachpb.LockUpdate { - intent := roachpb.MakeLockUpdate(s.Txn, roachpb.Span{Key: roachpb.KeyMin, EndKey: roachpb.KeyMax}) + keyMin := keys.LocalMax + keyMax := roachpb.KeyMax + if isLocal(s.key) { + keyMin = roachpb.KeyMin + keyMax = keys.LocalMax + } + intent := roachpb.MakeLockUpdate(s.Txn, roachpb.Span{Key: keyMin, EndKey: keyMax}) intent.Status = status return intent } @@ -1665,7 +1673,7 @@ func TestMVCCComputeStatsError(t *testing.T) { defer iter.Close() for _, mvccStatsTest := range mvccStatsTests { t.Run(mvccStatsTest.name, func(t *testing.T) { - _, err := mvccStatsTest.fn(iter, roachpb.KeyMin, roachpb.KeyMax, 100) + _, err := mvccStatsTest.fn(iter, keys.LocalMax, roachpb.KeyMax, 100) if e := "unable to decode MVCCMetadata"; !testutils.IsError(err, e) { t.Fatalf("expected %s, got %v", e, err) } diff --git a/pkg/storage/mvcc_test.go b/pkg/storage/mvcc_test.go index 7fa1b626b0a3..afacc17d92a6 100644 --- a/pkg/storage/mvcc_test.go +++ b/pkg/storage/mvcc_test.go @@ -961,7 +961,7 @@ func TestMVCCInvalidateIterator(t *testing.T) { _, err = MVCCFindSplitKey(ctx, batch, roachpb.RKeyMin, roachpb.RKeyMax, 64<<20) case "computeStats": iter := batch.NewMVCCIterator(MVCCKeyAndIntentsIterKind, iterOptions) - _, err = iter.ComputeStats(roachpb.KeyMin, roachpb.KeyMax, 0) + _, err = iter.ComputeStats(keys.LocalMax, roachpb.KeyMax, 0) iter.Close() } if err != nil { @@ -4598,7 +4598,7 @@ func TestMVCCGarbageCollect(t *testing.T) { defer iter.Close() for _, mvccStatsTest := range mvccStatsTests { t.Run(mvccStatsTest.name, func(t *testing.T) { - expMS, err := mvccStatsTest.fn(iter, roachpb.KeyMin, roachpb.KeyMax, ts3.WallTime) + expMS, err := mvccStatsTest.fn(iter, localMax, roachpb.KeyMax, ts3.WallTime) if err != nil { t.Fatal(err) } @@ -4696,6 +4696,37 @@ func TestMVCCGarbageCollectIntent(t *testing.T) { } } +// TestMVCCGarbageCollectPanicsWithMixOfLocalAndGlobalKeys verifies that +// MVCCGarbageCollect panics when presented with a mix of local and global +// keys. +func TestMVCCGarbageCollectPanicsWithMixOfLocalAndGlobalKeys(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + if DisallowSeparatedIntents || !enabledSeparatedIntents { + return + } + ctx := context.Background() + for _, engineImpl := range mvccEngineImpls { + t.Run(engineImpl.name, func(t *testing.T) { + engine := engineImpl.create() + defer engine.Close() + + require.Panics(t, func() { + ts := hlc.Timestamp{WallTime: 1e9} + k := roachpb.Key("a") + keys := []roachpb.GCRequest_GCKey{ + {Key: k, Timestamp: ts}, + {Key: keys.RangeDescriptorKey(roachpb.RKey(k))}, + } + if err := MVCCGarbageCollect(ctx, engine, nil, keys, ts); err != nil { + panic(err) + } + }) + }) + } +} + // readWriterReturningSeekLTTrackingIterator is used in a test to inject errors // and ensure that SeekLT is returned an appropriate number of times. type readWriterReturningSeekLTTrackingIterator struct { diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index 3ae1c111312f..6b19dbca5fea 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -620,10 +620,15 @@ func findSplitKeyUsingIterator( // SetUpperBound implements the MVCCIterator interface. func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) { + if upperBound == nil { + panic("SetUpperBound must not use a nil key") + } p.curBuf = (p.curBuf + 1) % 2 i := p.curBuf - p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], p.options.LowerBound...) - p.options.LowerBound = p.lowerBoundBuf[i] + if p.options.LowerBound != nil { + p.lowerBoundBuf[i] = append(p.lowerBoundBuf[i][:0], p.options.LowerBound...) + p.options.LowerBound = p.lowerBoundBuf[i] + } p.upperBoundBuf[i] = append(p.upperBoundBuf[i][:0], upperBound...) p.upperBoundBuf[i] = append(p.upperBoundBuf[i], 0x00) p.options.UpperBound = p.upperBoundBuf[i] diff --git a/pkg/storage/pebble_test.go b/pkg/storage/pebble_test.go index 8582b662482e..864dd8da893d 100644 --- a/pkg/storage/pebble_test.go +++ b/pkg/storage/pebble_test.go @@ -160,7 +160,11 @@ func TestPebbleIterReuse(t *testing.T) { // previous iterator should get zeroed. iter2 := batch.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: []byte{10}}) valuesCount = 0 - iter2.SeekGE(MVCCKey{Key: []byte{0}}) + // This is a peculiar test that is disregarding how local and global keys + // affect the behavior of MVCCIterators. This test is writing []byte{0} + // which precedes the localPrefix. Ignore the local and preceding keys in + // this seek. + iter2.SeekGE(MVCCKey{Key: []byte{2}}) for ; ; iter2.Next() { ok, err := iter2.Valid() if err != nil { @@ -176,8 +180,8 @@ func TestPebbleIterReuse(t *testing.T) { valuesCount++ } - if valuesCount != 10 { - t.Fatalf("expected 10 values, got %d", valuesCount) + if valuesCount != 8 { + t.Fatalf("expected 8 values, got %d", valuesCount) } iter2.Close() } diff --git a/pkg/storage/testdata/intent_interleaving_iter/basic b/pkg/storage/testdata/intent_interleaving_iter/basic index c2cf1d659acd..c7b7551da8fb 100644 --- a/pkg/storage/testdata/intent_interleaving_iter/basic +++ b/pkg/storage/testdata/intent_interleaving_iter/basic @@ -563,3 +563,226 @@ prev: output: meta k=b\0c\0d ts=20.000000000,0 txn=1 prev: output: value k=abcdefg\0 ts=20.000000000,0 v=a prev: output: meta k=abcdefg\0 ts=20.000000000,0 txn=1 prev: output: . + +# Local and global keys with separated locks. This test exercises previously +# buggy cases where callers were: +# - iterating over global keys without a lower bound. Since the manufactured +# lower bounds on the intentIter and iter, to prevent iter iterating into +# the lock table, were not semantically identical, the +# intentInterleavingIter would detect an inconsistency and go into error +# state. +# - iterating over local keys without an upper bound. We had the same issue +# with the manufactured bounds. + +define +locks +meta k=Lb ts=20 txn=2 +meta k=Lc ts=30 txn=4 +meta k=b ts=40 txn=5 +meta k=d ts=50 txn=6 +mvcc +meta k=La ts=10 txn=1 +value k=La ts=10 v=a10 +value k=Lb ts=20 v=b20 +value k=Lc ts=30 v=c30 +value k=b ts=40 v=b40 +value k=d ts=50 v=d50 +---- + +iter upper=e +seek-ge k=a +next +next +prev +prev +prev +next +---- +seek-ge "a"/0,0: output: meta k=b ts=40.000000000,0 txn=5 +next: output: value k=b ts=40.000000000,0 v=b40 +next: output: meta k=d ts=50.000000000,0 txn=6 +prev: output: value k=b ts=40.000000000,0 v=b40 +prev: output: meta k=b ts=40.000000000,0 txn=5 +prev: output: . +next: output: meta k=b ts=40.000000000,0 txn=5 + +iter lower=La +seek-lt k=Ld +prev +prev +next +next +next +prev +---- +seek-lt "Ld"/0,0: output: value k=Lc ts=30.000000000,0 v=c30 +prev: output: meta k=Lc ts=30.000000000,0 txn=4 +prev: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: meta k=Lc ts=30.000000000,0 txn=4 +next: output: value k=Lc ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=Lc ts=30.000000000,0 v=c30 + +iter prefix=true +seek-ge k=Lb +next +next +seek-ge k=La +next +next +seek-ge k=Laa +seek-ge k=b +next +next +seek-ge k=d +next +next +seek-ge k=e +seek-ge k=c +---- +seek-ge "Lb"/0,0: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: . +seek-ge "La"/0,0: output: meta k=La ts=10.000000000,0 txn=1 +next: output: value k=La ts=10.000000000,0 v=a10 +next: output: . +seek-ge "Laa"/0,0: output: . +seek-ge "b"/0,0: output: meta k=b ts=40.000000000,0 txn=5 +next: output: value k=b ts=40.000000000,0 v=b40 +next: output: . +seek-ge "d"/0,0: output: meta k=d ts=50.000000000,0 txn=6 +next: output: value k=d ts=50.000000000,0 v=d50 +next: output: . +seek-ge "e"/0,0: output: . +seek-ge "c"/0,0: output: . + +# The meta Sc is bogus since local store keys do not have locks, but it is +# worthwhile for the intentInterleavingIter to work cleanly here. +define +locks +meta k=Lb ts=20 txn=2 +meta k=Sc ts=30 txn=3 +meta k=d ts=40 txn=4 +mvcc +value k=Lb ts=20 v=b20 +value k=Sc ts=30 v=c30 +value k=d ts=40 v=d40 +---- + +# Iterator sees Sc, which it should. +iter lower=La +seek-ge k=La +prev +next +prev +next +next +next +next +next +prev +---- +seek-ge "La"/0,0: output: meta k=Lb ts=20.000000000,0 txn=2 +prev: output: . +next: output: meta k=Lb ts=20.000000000,0 txn=2 +prev: output: . +next: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: meta k=Sc ts=30.000000000,0 txn=3 +next: output: value k=Sc ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=Sc ts=30.000000000,0 v=c30 + +# Iterator sees Lb, which it should. +# error state. +iter upper=Sd +seek-lt k=Sd +prev +prev +prev +prev +next +---- +seek-lt "Sd"/0,0: output: value k=Sc ts=30.000000000,0 v=c30 +prev: output: meta k=Sc ts=30.000000000,0 txn=3 +prev: output: value k=Lb ts=20.000000000,0 v=b20 +prev: output: meta k=Lb ts=20.000000000,0 txn=2 +prev: output: . +next: output: meta k=Lb ts=20.000000000,0 txn=2 + +# Iterator over local keys, with upper bound equal to LocalMax. The underlying +# iterator over the MVCC key space will stop itself when it encounters the +# lock table keys (which are also local keys), without an error. +iter upper=Z +seek-ge k=Lb +next +next +next +next +prev +---- +seek-ge "Lb"/0,0: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: meta k=Sc ts=30.000000000,0 txn=3 +next: output: value k=Sc ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=Sc ts=30.000000000,0 v=c30 + +# Similar to previous test, but with an upper bound less than LocalMax but +# above the lock table key space. The result is the same +iter upper=Yc +seek-ge k=Lb +next +next +next +next +prev +---- +seek-ge "Lb"/0,0: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: meta k=Sc ts=30.000000000,0 txn=3 +next: output: value k=Sc ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=Sc ts=30.000000000,0 v=c30 + +# Write some keys above the lock table in the local key space, which should +# not happen in a correct system. Also note that the local store key with +# non-zero timestamps is bogus, but harmless. +define +locks +meta k=Lb ts=20 txn=2 +meta k=e ts=50 txn=3 +mvcc +value k=Lb ts=20 v=b20 +value k=Sc ts=30 v=c30 +value k=Yd ts=40 v=d40 +value k=e ts=50 v=e50 +---- + +# The intentInterleavingIter cannot see the Yd key, when iterating up from +# keys below the lock table. +iter upper=Z +seek-ge k=Lb +next +next +next +prev +---- +seek-ge "Lb"/0,0: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 +next: output: value k=Sc ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=Sc ts=30.000000000,0 v=c30 + +# When iterating backwards from LocalMax, we should see all the local keys, +# except for the Yd key. +iter upper=Z +seek-lt k=Z +prev +prev +next +---- +seek-lt "Z"/0,0: output: value k=Sc ts=30.000000000,0 v=c30 +prev: output: value k=Lb ts=20.000000000,0 v=b20 +prev: output: meta k=Lb ts=20.000000000,0 txn=2 +next: output: value k=Lb ts=20.000000000,0 v=b20 diff --git a/pkg/storage/testdata/mvcc_histories/conditional_put_with_txn_enable_separated b/pkg/storage/testdata/mvcc_histories/conditional_put_with_txn_enable_separated new file mode 100644 index 000000000000..ec971f7fa6ce --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/conditional_put_with_txn_enable_separated @@ -0,0 +1,110 @@ +run ok +txn_begin t=A ts=123 +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=123.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=123.000000000,0 wto=false max=0,0 + +# Write value1. + +run ok +with t=A + txn_step + cput k=k v=v +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=123.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=123.000000000,0 wto=false max=0,0 +meta: "k"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=123.000000000,0 min=0,0 seq=1} ts=123.000000000,0 del=false klen=12 vlen=6 +data: "k"/123.000000000,0 -> /BYTES/v + +# Now, overwrite value1 with value2 from same txn; should see value1 +# as pre-existing value. + +run ok +with t=A + txn_step + cput k=k v=v2 cond=v +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=123.000000000,0 min=0,0 seq=2} lock=true stat=PENDING rts=123.000000000,0 wto=false max=0,0 +meta: "k"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=123.000000000,0 min=0,0 seq=2} ts=123.000000000,0 del=false klen=12 vlen=7 ih={{1 /BYTES/v}} +data: "k"/123.000000000,0 -> /BYTES/v2 + +# Writing value3 from a new epoch should see nil again. + +run ok +with t=A + txn_restart + txn_step + cput k=k v=v3 +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=1 ts=123.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=123.000000000,0 wto=false max=0,0 +meta: "k"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=1 ts=123.000000000,0 min=0,0 seq=1} ts=123.000000000,0 del=false klen=12 vlen=7 +data: "k"/123.000000000,0 -> /BYTES/v3 + +# Commit value3 at a later timestamp. + +run ok +with t=A + txn_advance ts=124 + resolve_intent k=k + txn_remove +---- +>> at end: +data: "k"/124.000000000,0 -> /BYTES/v3 + +# Write value4 with an old timestamp without txn...should get a write +# too old error. + +run error +cput k=k v=v4 cond=v3 ts=123 +---- +>> at end: +data: "k"/124.000000000,1 -> /BYTES/v4 +data: "k"/124.000000000,0 -> /BYTES/v3 +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 123.000000000,0 too old; wrote at 124.000000000,1 + +# Reset for next test + +run ok +clear_range k=k end=-k +---- +>> at end: + + +# From TxnCoordSenderRetries, +# "multi-range batch with forwarded timestamp and cput and delete range" + +# First txn attempt + +run ok +# Before txn start: +put k=c v=value ts=1 +with t=A + txn_begin ts=2 + txn_step + cput k=c v=cput cond=value +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +meta: "c"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=1} ts=2.000000000,0 del=false klen=12 vlen=9 +data: "c"/2.000000000,0 -> /BYTES/cput +data: "c"/1.000000000,0 -> /BYTES/value + +# Restart and retry cput. It should succeed. + +run trace ok +with t=A + txn_restart ts=3 + txn_step + cput k=c v=cput cond=value +---- +>> txn_restart ts=3 t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=1 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false max=0,0 +>> txn_step t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=1 ts=3.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=3.000000000,0 wto=false max=0,0 +>> cput k=c v=cput cond=value t=A +called PutIntent("c", _, ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000002) +meta: "c"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=1 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=9 +data: "c"/3.000000000,0 -> /BYTES/cput +data: "c"/1.000000000,0 -> /BYTES/value diff --git a/pkg/storage/testdata/mvcc_histories/intent_history_enable_separated b/pkg/storage/testdata/mvcc_histories/intent_history_enable_separated new file mode 100644 index 000000000000..6ebd146bfdcb --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/intent_history_enable_separated @@ -0,0 +1,58 @@ +## Write the base (default) value. + +run ok +with t=A + txn_begin ts=1 + put k=a v=default resolve + txn_remove +---- +>> at end: +data: "a"/1.000000000,0 -> /BYTES/default + +## See how the intent history evolves throughout the test. + +run trace ok +with t=A + txn_begin ts=2 + with k=a + put v=first + txn_step + put v=second + txn_step n=2 + del + txn_step n=6 + put v=first + resolve_intent +---- +>> txn_begin ts=2 t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> put v=first k=a t=A +called PutIntent("a", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000002) +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=0} ts=2.000000000,0 del=false klen=12 vlen=10 +data: "a"/2.000000000,0 -> /BYTES/first +data: "a"/1.000000000,0 -> /BYTES/default +>> txn_step k=a t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> put v=second k=a t=A +called PutIntent("a", _, ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000002) +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=1} ts=2.000000000,0 del=false klen=12 vlen=11 ih={{0 /BYTES/first}} +data: "a"/2.000000000,0 -> /BYTES/second +data: "a"/1.000000000,0 -> /BYTES/default +>> txn_step n=2 k=a t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=3} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> del k=a t=A +called PutIntent("a", _, ExistingIntentSeparated, TDNUM(false), 00000000-0000-0000-0000-000000000002) +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=3} ts=2.000000000,0 del=true klen=12 vlen=0 ih={{0 /BYTES/first}{1 /BYTES/second}} +data: "a"/2.000000000,0 -> / +data: "a"/1.000000000,0 -> /BYTES/default +>> txn_step n=6 k=a t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=9} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> put v=first k=a t=A +called PutIntent("a", _, ExistingIntentSeparated, TDNUM(false), 00000000-0000-0000-0000-000000000002) +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=9} ts=2.000000000,0 del=false klen=12 vlen=10 ih={{0 /BYTES/first}{1 /BYTES/second}{3 /}} +data: "a"/2.000000000,0 -> /BYTES/first +data: "a"/1.000000000,0 -> /BYTES/default +>> resolve_intent k=a t=A +called ClearIntent("a", ExistingIntentSeparated, TDNUM(false), 00000000-0000-0000-0000-000000000002) +data: "a"/2.000000000,0 -> /BYTES/first +data: "a"/1.000000000,0 -> /BYTES/default diff --git a/pkg/storage/testdata/mvcc_histories/intent_with_write_tracing_enable_separated b/pkg/storage/testdata/mvcc_histories/intent_with_write_tracing_enable_separated new file mode 100644 index 000000000000..f5003acca1bf --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/intent_with_write_tracing_enable_separated @@ -0,0 +1,87 @@ +run trace ok +with t=A + txn_begin ts=2 + put k=k1 v=v1 +---- +>> txn_begin ts=2 t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> put k=k1 v=v1 t=A +called PutIntent("k1", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "k1"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=2.000000000,0 min=0,0 seq=0} ts=2.000000000,0 del=false klen=12 vlen=7 +data: "k1"/2.000000000,0 -> /BYTES/v1 + +run trace ok +with t=A + txn_advance ts=3 + txn_step + put k=k1 v=v1 + put k=k2 v=v2 +---- +>> txn_advance ts=3 t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> txn_step t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} lock=true stat=PENDING rts=2.000000000,0 wto=false max=0,0 +>> put k=k1 v=v1 t=A +called PutIntent("k1", _, ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "k1"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 ih={{0 /BYTES/v1}} +data: "k1"/3.000000000,0 -> /BYTES/v1 +>> put k=k2 v=v2 t=A +called PutIntent("k2", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "k1"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 ih={{0 /BYTES/v1}} +data: "k1"/3.000000000,0 -> /BYTES/v1 +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 +data: "k2"/3.000000000,0 -> /BYTES/v2 + +run trace ok +put k=k3 v=v3 ts=1 +---- +>> put k=k3 v=v3 ts=1 +meta: "k1"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 ih={{0 /BYTES/v1}} +data: "k1"/3.000000000,0 -> /BYTES/v1 +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 +data: "k2"/3.000000000,0 -> /BYTES/v2 +data: "k3"/1.000000000,0 -> /BYTES/v3 + +run trace ok +with t=A + put k=k3 v=v33 +---- +>> put k=k3 v=v33 t=A +called PutIntent("k3", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "k1"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 ih={{0 /BYTES/v1}} +data: "k1"/3.000000000,0 -> /BYTES/v1 +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 +data: "k2"/3.000000000,0 -> /BYTES/v2 +meta: "k3"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=8 +data: "k3"/3.000000000,0 -> /BYTES/v33 +data: "k3"/1.000000000,0 -> /BYTES/v3 + +# transactionDidNotUpdateMeta (TDNUM) is false below for k2 and k3 since +# disallowSeparatedIntents=true causes mvcc.go to always set it to false to maintain +# consistency in a mixed version cluster. +run trace ok +with t=A + resolve_intent k=k1 + resolve_intent k=k2 status=ABORTED + resolve_intent k=k3 status=ABORTED + txn_remove +---- +>> resolve_intent k=k1 t=A +called ClearIntent("k1", ExistingIntentSeparated, TDNUM(false), 00000000-0000-0000-0000-000000000001) +data: "k1"/3.000000000,0 -> /BYTES/v1 +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=7 +data: "k2"/3.000000000,0 -> /BYTES/v2 +meta: "k3"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=8 +data: "k3"/3.000000000,0 -> /BYTES/v33 +data: "k3"/1.000000000,0 -> /BYTES/v3 +>> resolve_intent k=k2 status=ABORTED t=A +called ClearIntent("k2", ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000001) +data: "k1"/3.000000000,0 -> /BYTES/v1 +meta: "k3"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=1} ts=3.000000000,0 del=false klen=12 vlen=8 +data: "k3"/3.000000000,0 -> /BYTES/v33 +data: "k3"/1.000000000,0 -> /BYTES/v3 +>> resolve_intent k=k3 status=ABORTED t=A +called ClearIntent("k3", ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000001) +data: "k1"/3.000000000,0 -> /BYTES/v1 +data: "k3"/1.000000000,0 -> /BYTES/v3 +>> txn_remove t=A diff --git a/pkg/storage/testdata/mvcc_histories/no_read_after_abort_enable_separated b/pkg/storage/testdata/mvcc_histories/no_read_after_abort_enable_separated new file mode 100644 index 000000000000..c6f4b72169f2 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/no_read_after_abort_enable_separated @@ -0,0 +1,30 @@ +## Simple txn that aborts. + +run trace ok +with t=A k=a + txn_begin ts=22 + put v=cde + resolve_intent status=ABORTED + txn_remove +---- +>> txn_begin ts=22 t=A k=a +txn: "A" meta={id=00000000 key="a" pri=0.00000000 epo=0 ts=22.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=22.000000000,0 wto=false max=0,0 +>> put v=cde t=A k=a +called PutIntent("a", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "a"/0,0 -> txn={id=00000000 key="a" pri=0.00000000 epo=0 ts=22.000000000,0 min=0,0 seq=0} ts=22.000000000,0 del=false klen=12 vlen=8 +data: "a"/22.000000000,0 -> /BYTES/cde +>> resolve_intent status=ABORTED t=A k=a +called ClearIntent("a", ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000001) + +>> txn_remove t=A k=a + +# Cannot read aborted value. + +run ok +with t=A + txn_begin ts=23 + get k=a + txn_remove +---- +get: "a" -> +>> at end: diff --git a/pkg/storage/testdata/mvcc_histories/read_after_write_enable_separated b/pkg/storage/testdata/mvcc_histories/read_after_write_enable_separated new file mode 100644 index 000000000000..04c6b3fec08f --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/read_after_write_enable_separated @@ -0,0 +1,91 @@ +## A simple txn that commits. + +run trace ok +with t=A + txn_begin ts=11 + with k=a + put v=abc + get + resolve_intent +---- +>> txn_begin ts=11 t=A +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=11.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=11.000000000,0 wto=false max=0,0 +>> put v=abc k=a t=A +called PutIntent("a", _, NoExistingIntent, TDNUM(true), 00000000-0000-0000-0000-000000000001) +meta: "a"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=11.000000000,0 min=0,0 seq=0} ts=11.000000000,0 del=false klen=12 vlen=8 +data: "a"/11.000000000,0 -> /BYTES/abc +>> get k=a t=A +get: "a" -> /BYTES/abc @11.000000000,0 +>> resolve_intent k=a t=A +called ClearIntent("a", ExistingIntentSeparated, TDNUM(true), 00000000-0000-0000-0000-000000000001) +data: "a"/11.000000000,0 -> /BYTES/abc + +run ok +with t=A resolve + put k=a/1 v=eee + put k=b v=fff + put k=b/2 v=ggg + put k=c v=hhh + txn_remove +---- +>> at end: +data: "a"/11.000000000,0 -> /BYTES/abc +data: "a/1"/11.000000000,0 -> /BYTES/eee +data: "b"/11.000000000,0 -> /BYTES/fff +data: "b/2"/11.000000000,0 -> /BYTES/ggg +data: "c"/11.000000000,0 -> /BYTES/hhh + +# Reads previous writes, transactional. + +run ok +with t=A + txn_begin ts=11 + get k=a +---- +get: "a" -> /BYTES/abc @11.000000000,0 +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=11.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=11.000000000,0 wto=false max=0,0 + +run trace ok +with t=A + scan k=a end==b + scan k=a end=+a + scan k=a end=-a + scan k=a end=+b + scan k=a end==b + scan k=a end=-b + txn_remove +---- +>> scan k=a end==b t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +scan: "a/1" -> /BYTES/eee @11.000000000,0 +>> scan k=a end=+a t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +>> scan k=a end=-a t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +scan: "a/1" -> /BYTES/eee @11.000000000,0 +>> scan k=a end=+b t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +scan: "a/1" -> /BYTES/eee @11.000000000,0 +scan: "b" -> /BYTES/fff @11.000000000,0 +>> scan k=a end==b t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +scan: "a/1" -> /BYTES/eee @11.000000000,0 +>> scan k=a end=-b t=A +scan: "a" -> /BYTES/abc @11.000000000,0 +scan: "a/1" -> /BYTES/eee @11.000000000,0 +scan: "b" -> /BYTES/fff @11.000000000,0 +scan: "b/2" -> /BYTES/ggg @11.000000000,0 +>> txn_remove t=A + + +## A simple txn anchored at some arbitrary key. + +run trace ok +with t=A k=a + txn_begin ts=1 + txn_remove +---- +>> txn_begin ts=1 t=A k=a +txn: "A" meta={id=00000000 key="a" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false max=0,0 +>> txn_remove t=A k=a