diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index 42757860faa5..29c8e4d99115 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" @@ -35,19 +36,122 @@ const ( constrainedToGlobal ) -// intentInterleavingIter makes separated intents appear as interleaved. It -// relies on the following assumptions: -// - There can also be intents that are physically interleaved. -// However, for a particular roachpb.Key there will be at most one intent, -// either interleaved or separated. -// - An intent will have a corresponding provisional value. +// commitMap maintains a logical map of TxnIDs of transactions that were +// "simple" in their behavior, and have committed, where simple is defined as +// all the following conditions: +// - No savepoint rollbacks: len(intent.IgnoredSeqNums)==0 +// - Single epoch, i.e., TxnMeta.Epoch==0 +// - Never pushed, i.e., TxnMeta.MinTimestamp==TxnMeta.WriteTimestamp +// For such transactions, their provisional values can be considered committed +// with the current version and local timestamp, i.e., we need no additional +// information about the txn other than the TxnID. +// +// Adding to commitMap: +// The earliest a txn can be added to the commitMap is when the transaction is +// in STAGING and has verified all the conditions for commit. That is, this +// can be done before the transition to COMMITTED is replicated. For the node +// that has the leaseholder of the range containing the txn record, this +// requires no external communication. For other nodes with intents for the +// txn, one could piggyback this information on the RPCs for async intent +// resolution, and add to the the commitMap before doing the intent resolution +// -- this piggybacking would incur 2 consensus rounds of contention. If we +// are willing to send the RPC earlier, it will be contention for 1 consensus +// round only. Note that these RPCs should also remove locks from the +// non-persistent lock table data-structure so should send information about +// the keys (like in a LockUpdate but won't remove the durable lock state). +// +// Removal from commitMap: +// The commitMap can be considered a cache of TxnIDs. It is helpful to have a +// txn in the cache until its intents have been resolved. Additionally, +// latchless intent resolution must pin a txn in the map before it releases +// latches and unpin when the intent resolution has been applied on the +// leaseholder. This pinning behavior is needed to ensure the correctness of +// the in-memory concurrency.lockTable, which must maintain the property that +// the replicated locks known to it are a subset of the persistent replicated +// locks. We are assuming here that there is no lockTable on followers. +// +// commitMap per range or node?: +// Scoping it to a range may be simpler, and result in smaller maps that are +// more efficient (see synchronization section below). But the same TxnID can +// be in multiple maps, so there is the overhead of duplication. +// +// Synchronization: +// An ideal data-structure would offer very efficient O(1) lookup without the +// need to acquire a mutex. So a copy-on-write approach would be ideal for +// reads. We also need to support a high rate of insertions. Removal can be +// batched, say by removing everything added > 50ms in the past. +// +// TODO: the current prototype focuses on changes to the read-path so just use +// a map. +type commitMap struct { + simpleCommits map[uuid.UUID]struct{} +} + +func (cm *commitMap) isPresent(txnID uuid.UUID) bool { + if cm == nil || cm.simpleCommits == nil { + return false + } + _, present := cm.simpleCommits[txnID] + return present +} + +// Multiple intents for a key: +// +// We allow a key (with multiple versions) to have multiple intents, under the +// condition that at most one of the intents is uncommitted. +// Additionally: +// - We don't want to have to coordinate intent resolution of these multiple +// intents, by mandating that the resolution happen in any particular order +// +// - We want to guarantee that even if the commitMap is cleared (since it is a +// cache), we can maintain the invariant that a caller iterating over a key +// sees at most one intent. As we will illustrate below, providing this +// guarantee requires us to limit the commitMap to only contain +// simple-committed txns. +// +// Consider a key with timestamps t5, t4, t3, t2, t1 in decreasing order and +// intents for t5, t4, t2, with corresponding txns txn5, ... txn1. We consider +// the disposition of an intent to be either unknown or simple-committed. In +// this example, the disposition of the intent for t4 and t2 is +// simple-committed solely based on the fact that there is at least one +// version (provisional or otherwise) more recent that the timestamp of the +// intent. That is, at most one intent, the one for t5, has a disposition that +// needs to rely on knowledge that is not self-contained in the history. For +// t5, we must rely on the commitMap to decide whether is unknown or +// simple-committed. It is possible that some user of the +// intentInterleavingIter saw t5 as simple-committed and a later user sees it +// as unknown disposition, if the txn5 got removed from the commitMap -- such +// regression is harmless since the latter user will simply have to do intent +// resolution. Note that the intent for t5 could get resolved before those for +// t4 and t2, and that is also fine since the disposition of t4, t2 stays +// simple-committed. If txn5 is aborted and the intent for t5 removed, and +// txn4 is no longer in the commitMap, the disposition of t4 could change to +// unknown. This is also acceptable, since t5 was only serving as a "local +// promise" that t4 was committed, which is simply an optimization. There is +// still a less efficient globally available "promise" that t4 is committed, +// and intent resolution of t4 is how we will enact that promise. +// +// Maintaining the above guarantees requires that historical versions must not +// be garbage collected without resolving intents. This is acceptable since GC +// is not latency sensitive. + +// TODO: don't forget to fix the iteration for intent resolution in mvcc.go + +// intentInterleavingIter makes physically separated intents appear as +// logically interleaved. It assumes that: +// - There are no physically interleaved intents. +// - An intent must have a corresponding value (possibly provisional). // - The only single key locks in the lock table key space are intents. // +// +// TODO: update following comment since we now have multiple intents and we are +// disallowing physically interleaved intents. +// // Semantically, the functionality is equivalent to merging two MVCCIterators: // - A MVCCIterator on the MVCC key space. // - A MVCCIterator constructed by wrapping an EngineIterator on the lock table // key space where the EngineKey is transformed into the corresponding -// intent key and appears as MVCCKey{Key: intentKey}. +// intent key and appears as MVCCKey{Key: intentIterKey}. // The implementation below is specialized to reduce unnecessary comparisons // and iteration, by utilizing the aforementioned assumptions. The intentIter // iterates over the lock table key space and iter over the MVCC key space. @@ -85,7 +189,7 @@ type intentInterleavingIter struct { prefix bool constraint intentInterleavingIterConstraint - // iter is for iterating over MVCC keys and interleaved intents. + // iter is for iterating over MVCC keys. iter MVCCIterator // The valid value from iter.Valid() after the last positioning call. iterValid bool @@ -96,28 +200,103 @@ type intentInterleavingIter struct { // intentIter is for iterating over separated intents, so that // intentInterleavingIter can make them look as if they were interleaved. - intentIter EngineIterator + // + // If we observe an intent whose decoded key is equal to the current iter + // key, we need to iterate over all such intents, and + // - check whether the txnID is in the commitMap. + // - decode the MVCCMetadata and compare the MVCCMetadata.Timestamps. The only + // potential unknown disposition MVCCMetadata is the one with the highest + // timestamp. Its disposition will be determined using the commitMap and + // based on whether there are more recent committed versions. + intentIter EngineIterator + // The validity state of intentIter. intentIterState pebble.IterValidityState + // TODO(sumeer): change MVCCIterator interface so that caller can use what + // the intentInterleavingIter has parsed and avoid parsing again. + // + // Parsing cost: consider the following cases + // - No contention: reads and writes that are using an + // intentInterleavingIter will not see intents, and so there is no parsing + // of MVCCMetadata. + // - Read-write contention: intentInterleavingIter will see a single intent + // and parse it. It either has simple-committed or unknown disposition. + // Prior to supporting simple-committed disposition, the caller would have + // parsed the intent, so there is no additional cost (assuming we avoid + // repeated parsing, as mentioned above). With simple-committed disposition, + // the intentInterleavingIter will parse and discard, which is fine -- we + // have avoided false contention. + // - Write-write contention: intentInterleavingIter could see multiple + // intents and it has to parse them all. We claim that the saved false + // contention justifies this cost. + // + // mvccMetadata, when non-nil, represents an unknown disposition intent that + // is the current position of the interleaving iter. That is, this is the + // intent corresponding to the highest version and is not in the commitMap. + // + // mvccMetadata is populated by iterating over all the intents corresponding + // to a key. + // - For forward iteration this is done when the intent key and the version + // key, i.e., intentIterKey and iterKey become equal. As a result of this + // iteration, the intentIter is moved past all the intents for that key + // (see numIntents for how many intents were encountered). + // - For reverse iteration this is done when iterating past the latest + // provisional value. The intentIter will also move past all the intents + // for that key (in the reverse direction). + // Note that both cases share the similarity that mvccMetadata is being + // populated when, if non-nil, this intent has to be exposed now to the + // caller of intentInterleavingIter. + mvccMetadata *enginepb.MVCCMetadata + // The number of intents seen in the last call to tryInitIntentState. This + // count is used to decide how many times we need to step the intentIter + // when switching from forward to reverse iteration and vice versa, and the + // interleavingIter was currently positioned at the intent. + numIntents int + // intentKey is the key for the corresponding mvccMetadata. It is not the + // same as intentIterKey since the intentIter has been moved past all the + // intents for this key. + intentKey roachpb.Key + // Parsing buffers. If mvccMetadata != nil, it points to one of the entries + // in metadataBuf. + metadataBuf [2]enginepb.MVCCMetadata + // largestVersionSeen is used in reverse iteration so that we can determine + // whether an intent is older than another version and therefore + // simple-committed. + largestVersionSeen hlc.Timestamp + // The last limit key used in *WithLimit calls. We remember it so that it + // can be used later for iteration over the lock table, when consuming all + // the intents for a key. + lastLimitKey roachpb.Key + // The decoded key from the lock table. This is an unsafe key // in that it is only valid when intentIter has not been // repositioned. It is nil if the intentIter is considered to be // exhausted. Note that the intentIter may still be positioned // at a valid position in the case of prefix iteration, but the - // state of the intentKey overrides that state. - intentKey roachpb.Key - intentKeyAsNoTimestampMVCCKey []byte - intentKeyAsNoTimestampMVCCKeyBacking []byte + // intentIterKey may be set to nil to override that state. + intentIterKey roachpb.Key + intentKeyAsNoTimestampMVCCKey []byte - // - cmp output of (intentKey, current iter key) when both are valid. + // intentCmp is used to represent the relationship of an intent key with + // the current iter key. When mvccMetadata is non-nil, the intent key is + // intentKey: + // - Forward direction: since there are versions corresponding to the + // mvccMetadata, this will always be 0. + // - Reverse direction: since iter has been moved before the intent, this + // will always be +1. + // By definition, valid=true in this case. + // + // The common case is mvccMetadata is nil, i.e., the interleaving iter is not + // positioned at an intent. In that case, this is defined as: + // - cmp output of (intentIterKey, current iter key) when both are valid. // This does not take timestamps into consideration. So if intentIter // is at an intent, and iter is at the corresponding provisional value, // cmp will be 0. See the longer struct-level comment for more on the // relative positioning of intentIter and iter. - // - intentKey==nil, iterValid==true, cmp=dir + // - intentIterKey==nil, iterValid==true, cmp=dir // (i.e., the nil key is akin to infinity in the forward direction // and -infinity in the reverse direction, since that iterator is // exhausted). - // - intentKey!=nil, iterValid=false, cmp=-dir. + // - intentIterKey!=nil, iterValid=false, cmp=-dir. // - If both are invalid. cmp is undefined and valid=false. intentCmp int // The current direction. +1 for forward, -1 for reverse. @@ -127,14 +306,14 @@ type intentInterleavingIter struct { // Buffers to reuse memory when constructing lock table keys for bounds and // seeks. - intentKeyBuf []byte + intentSeekKeyBuf []byte intentLimitKeyBuf []byte -} -// TODO(bananabrick): Update intent interleaving iter so that -// it doesn't understand interleaved intents. As of now, cockroach -// can't write new interleaved intents, but can read them using -// this iterator. + // Assumption: commitMap is akin to a snapshot, i.e., will not lose entries + // while intentInterleavingIter is using it. Gaining new entries is + // harmless. + commitMap *commitMap +} var _ MVCCIterator = &intentInterleavingIter{} @@ -202,10 +381,10 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator iiIter := intentInterleavingIterPool.Get().(*intentInterleavingIter) intentOpts := opts - intentKeyBuf := iiIter.intentKeyBuf + intentSeekKeyBuf := iiIter.intentSeekKeyBuf intentLimitKeyBuf := iiIter.intentLimitKeyBuf if opts.LowerBound != nil { - intentOpts.LowerBound, intentKeyBuf = keys.LockTableSingleKey(opts.LowerBound, intentKeyBuf) + intentOpts.LowerBound, intentSeekKeyBuf = keys.LockTableSingleKey(opts.LowerBound, intentSeekKeyBuf) } else if !opts.Prefix { // Make sure we don't step outside the lock table key space. Note that // this is the case where the lower bound was not set and @@ -237,13 +416,13 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator } *iiIter = intentInterleavingIter{ - prefix: opts.Prefix, - constraint: constraint, - iter: iter, - intentIter: intentIter, - intentKeyAsNoTimestampMVCCKeyBacking: iiIter.intentKeyAsNoTimestampMVCCKeyBacking, - intentKeyBuf: intentKeyBuf, - intentLimitKeyBuf: intentLimitKeyBuf, + prefix: opts.Prefix, + constraint: constraint, + iter: iter, + intentIter: intentIter, + intentKey: iiIter.intentKey, + intentSeekKeyBuf: intentSeekKeyBuf, + intentLimitKeyBuf: intentLimitKeyBuf, } return iiIter } @@ -254,10 +433,16 @@ func newIntentInterleavingIterator(reader Reader, opts IterOptions) MVCCIterator // strengthening the semantics and using the tightness of these limits to // avoid comparisons between iterKey and intentKey. -// makeUpperLimitKey uses the current value of i.iterKey.Key (and assumes -// i.iterValid=true), to construct an exclusive upper limit roachpb.Key that -// will include the intent for i.iterKey.Key. -func (i *intentInterleavingIter) makeUpperLimitKey() roachpb.Key { +// makeUpperLimitKey uses the current value of i.iterKey.Key (if +// i.iterValid=true and the iterator is not doing prefix iteration), to +// construct an exclusive upper limit roachpb.Key that will include the intent +// for i.iterKey.Key. If the above condition is not satisfied, the limit key +// is nil, i.e., no limit. The limit key is stored in i.lastLimitKey. +func (i *intentInterleavingIter) makeUpperLimitKey() { + i.lastLimitKey = nil + if !i.iterValid || i.prefix { + return + } key := i.iterKey.Key // The +2 is to account for the call to BytesNext and the need to append a // '\x00' in the implementation of the *WithLimit function. The rest is the @@ -277,13 +462,19 @@ func (i *intentInterleavingIter) makeUpperLimitKey() roachpb.Key { // the Pebble level, which is all we need here. We don't actually use // BytesNext since it tries not to overwrite the slice. i.intentLimitKeyBuf = append(i.intentLimitKeyBuf, '\x00') - return i.intentLimitKeyBuf + i.lastLimitKey = i.intentLimitKeyBuf } -// makeLowerLimitKey uses the current value of i.iterKey.Key (and assumes +// makeLowerLimitKey uses the current value of i.iterKey.Key (if // i.iterValid=true), to construct an inclusive lower limit roachpb.Key that -// will include the intent for i.iterKey.Key. -func (i *intentInterleavingIter) makeLowerLimitKey() roachpb.Key { +// will include the intent for i.iterKey.Key. If the above condition is not +// satisfied, the limit key is nil, i.e., no limit. The limit key is stored in +// i.lastLimitKey. +func (i *intentInterleavingIter) makeLowerLimitKey() { + i.lastLimitKey = nil + if !i.iterValid { + return + } key := i.iterKey.Key // The +1 is to account for the need to append a '\x00' in the // implementation of the *WithLimit function. The rest is the same as in the @@ -296,13 +487,14 @@ func (i *intentInterleavingIter) makeLowerLimitKey() roachpb.Key { i.intentLimitKeyBuf = make([]byte, 0, keyLen) } _, i.intentLimitKeyBuf = keys.LockTableSingleKey(key, i.intentLimitKeyBuf) - return i.intentLimitKeyBuf + i.lastLimitKey = i.intentLimitKeyBuf } func (i *intentInterleavingIter) SeekGE(key MVCCKey) { i.dir = +1 i.valid = true i.err = nil + i.mvccMetadata = nil if i.constraint != notConstrained { i.checkConstraint(key.Key, false) @@ -314,31 +506,34 @@ func (i *intentInterleavingIter) SeekGE(key MVCCKey) { var intentSeekKey roachpb.Key if key.Timestamp.IsEmpty() { // Common case. - intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key, i.intentKeyBuf) + intentSeekKey, i.intentSeekKeyBuf = keys.LockTableSingleKey(key.Key, i.intentSeekKeyBuf) } else if !i.prefix { // Seeking to a specific version, so go past the intent. - intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key.Next(), i.intentKeyBuf) + // NB: the caller should only be doing this if it has observed the intent, + // or knows there is no intent, since it may now be exposed to a + // provisional value. + intentSeekKey, i.intentSeekKeyBuf = keys.LockTableSingleKey(key.Key.Next(), i.intentSeekKeyBuf) } else { // Else seeking to a particular version and using prefix iteration, // so don't expect to ever see the intent. NB: intentSeekKey is nil. - i.intentKey = nil + i.intentIterKey = nil } if intentSeekKey != nil { - var limitKey roachpb.Key - if i.iterValid && !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.SeekEngineKeyGEWithLimit(EngineKey{Key: intentSeekKey}, limitKey) + i.makeUpperLimitKey() + iterState, err := i.intentIter.SeekEngineKeyGEWithLimit( + EngineKey{Key: intentSeekKey}, i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } } - i.computePos() + i.computePos(false) } func (i *intentInterleavingIter) SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID) { i.dir = +1 i.valid = true + i.err = nil + i.mvccMetadata = nil if i.constraint != notConstrained { i.checkConstraint(key, false) @@ -348,20 +543,17 @@ func (i *intentInterleavingIter) SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID return } var engineKey EngineKey - engineKey, i.intentKeyBuf = LockTableKey{ + engineKey, i.intentSeekKeyBuf = LockTableKey{ Key: key, Strength: lock.Exclusive, TxnUUID: txnUUID[:], - }.ToEngineKey(i.intentKeyBuf) - var limitKey roachpb.Key - if i.iterValid && !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.SeekEngineKeyGEWithLimit(engineKey, limitKey) + }.ToEngineKey(i.intentSeekKeyBuf) + i.makeUpperLimitKey() + iterState, err := i.intentIter.SeekEngineKeyGEWithLimit(engineKey, i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } - i.computePos() + i.computePos(false) } func (i *intentInterleavingIter) checkConstraint(k roachpb.Key, isExclusiveUpper bool) { @@ -393,24 +585,138 @@ func (i *intentInterleavingIter) tryDecodeKey() error { return i.err } -// Assumes that i.err != nil. And i.iterValid and i.iterKey are up to date. -func (i *intentInterleavingIter) computePos() { - if !i.iterValid && i.intentKey == nil { +// Assumes that i.err != nil. And i.iterValid, i.iterKey, i.intentIterKey are +// up to date. +func (i *intentInterleavingIter) computePos(isSeekLT bool) { + if !i.iterValid && i.intentIterKey == nil { i.valid = false return } // INVARIANT: i.iterValid || i.intentKey != nil if !i.iterValid { - i.intentCmp = -i.dir + if i.dir == +1 { + i.err = errors.Errorf("intent with no version") + i.valid = false + return + } + // i.dir == -1, i.e., reverse iteration. + if isSeekLT { + i.err = errors.Errorf("SeekLT cannot be used to skip the latest version and land on intent") + i.valid = false + return + } + i.intentCmp = +1 + i.tryInitIntentState() return } - if i.intentKey == nil { + if i.intentIterKey == nil { i.intentCmp = i.dir + return + } + // iterValid && intentIterKey != nil + i.intentCmp = i.intentIterKey.Compare(i.iterKey.Key) + if i.intentCmp <= 0 && i.dir == +1 { + if i.intentCmp < 0 { + i.err = errors.Errorf("intent with no version") + i.valid = false + return + } + i.tryInitIntentState() + } else if i.intentCmp > 0 && i.dir == -1 { + if isSeekLT { + i.err = errors.Errorf("SeekLT cannot be used to skip the latest version and land on intent") + i.valid = false + return + } + i.tryInitIntentState() + } else if isSeekLT { + i.largestVersionSeen = i.iterKey.Timestamp + } +} + +func (i *intentInterleavingIter) tryInitIntentState() error { + i.mvccMetadata = nil + i.numIntents = 0 + i.intentKey = append(i.intentKey[:0], i.intentIterKey...) + nextParseIndex := 0 + for { + err := protoutil.Unmarshal(i.intentIter.UnsafeValue(), &i.metadataBuf[nextParseIndex]) + i.numIntents++ + if err != nil { + i.err = err + i.valid = false + return err + } + meta := &i.metadataBuf[nextParseIndex] + if i.mvccMetadata != nil { + if i.mvccMetadata.Timestamp.Less(meta.Timestamp) { + // Committed. + i.mvccMetadata = nil + if !i.commitMap.isPresent(meta.Txn.ID) { + i.mvccMetadata = meta + nextParseIndex = 1 - nextParseIndex + } + } + // Else meta is committed. + } else if !i.commitMap.isPresent(meta.Txn.ID) { + i.mvccMetadata = meta + nextParseIndex = 1 - nextParseIndex + } + var iterState pebble.IterValidityState + if i.dir == +1 { + iterState, err = i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) + } else { + iterState, err = i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) + } + if err = i.tryDecodeLockKey(iterState, err); err != nil { + return err + } + if i.intentIterKey == nil || !i.intentIterKey.Equal(i.intentKey) { + break + } + } + // Check that mvccMetadata is not simple-committed because of being an older + // version. + if i.mvccMetadata != nil && + ((i.dir == +1 && i.mvccMetadata.Timestamp.ToTimestamp().Less(i.iterKey.Timestamp)) || + (i.dir == -1 && i.mvccMetadata.Timestamp.ToTimestamp().Less(i.largestVersionSeen))) { + // Simple committed. + i.mvccMetadata = nil + } + if i.mvccMetadata == nil { + // tryInitIntentState was called in the following cases, both of which + // need fixing of i.intentCmp, since we don't actually have an intent. + // - i.dir == +1 and i.intentCmp == 0 + // - i.dir == -1 and i.intentCmp > 0 + if i.dir == +1 { + // We know there are versioned value(s) with a key equal to all the + // intents we have skipped over. + i.intentCmp = +1 + } else { + // Reverse iteration. + if i.iterValid { + i.intentCmp = i.intentIterKey.Compare(i.iterKey.Key) + } else { + // There shouldn't be anything more to read from intentIter too. + if i.intentIterKey != nil { + i.err = errors.Errorf("intent with no version") + i.valid = false + return i.err + } + } + } + if !i.iterValid && i.intentIterKey == nil { + i.valid = false + } } else { - i.intentCmp = i.intentKey.Compare(i.iterKey.Key) + // We found one intent that should be seen by the caller. + i.intentKeyAsNoTimestampMVCCKey = i.intentKey.Next() } + + return nil } +// tryDecodeLockKey is called after intentIter is moved. func (i *intentInterleavingIter) tryDecodeLockKey( iterState pebble.IterValidityState, err error, ) error { @@ -425,7 +731,7 @@ func (i *intentInterleavingIter) tryDecodeLockKey( // about the state of i.iter, which may be valid. It is the caller's // responsibility to additionally use the state of i.iter to appropriately // set i.valid. - i.intentKey = nil + i.intentIterKey = nil return nil } engineKey, err := i.intentIter.UnsafeEngineKey() @@ -434,35 +740,11 @@ func (i *intentInterleavingIter) tryDecodeLockKey( i.valid = false return err } - if i.intentKey, err = keys.DecodeLockTableSingleKey(engineKey.Key); err != nil { + if i.intentIterKey, err = keys.DecodeLockTableSingleKey(engineKey.Key); err != nil { i.err = err i.valid = false return err } - // If we were to encode MVCCKey{Key: i.intentKey}, i.e., encode it as an - // MVCCKey with no timestamp, the encoded bytes would be intentKey + \x00. - // Such an encoding is needed by callers of UnsafeRawMVCCKey. We would like - // to avoid copying the bytes in intentKey, if possible, for this encoding. - // Fortunately, the common case in the above call of - // DecodeLockTableSingleKey, that decodes intentKey from engineKey.Key, is - // for intentKey to not need un-escaping, so it will point to the slice that - // was backing engineKey.Key. engineKey.Key uses an encoding that terminates - // the intent key using \x00\x01. So the \x00 we need is conveniently there. - // This optimization also usually works when there is un-escaping, since the - // slice growth algorithm usually ends up with a cap greater than len. Since - // these extra bytes in the cap are 0-initialized, the first byte following - // intentKey is \x00. - // - // If this optimization is not possible, we leave - // intentKeyAsNoTimestampMVCCKey as nil, and lazily initialize it, if - // needed. - i.intentKeyAsNoTimestampMVCCKey = nil - if cap(i.intentKey) > len(i.intentKey) { - prospectiveKey := i.intentKey[:len(i.intentKey)+1] - if prospectiveKey[len(i.intentKey)] == 0 { - i.intentKeyAsNoTimestampMVCCKey = prospectiveKey - } - } return nil } @@ -486,22 +768,26 @@ func (i *intentInterleavingIter) Next() { if err := i.tryDecodeKey(); err != nil { return } - var limitKey roachpb.Key - if i.iterValid && !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) + i.makeUpperLimitKey() + iterState, err := i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } - i.computePos() + i.computePos(false) return } // At least one of the iterators is not exhausted. + // Cases: + // - isCurAtIntent: + // - iter is physically before intentKey, so need to step it once forward, so that is + // looks like how forward iteration looks when the interleaving iter is at an intent. + // intentIter needs to be stepped numIntents times. + // - !isCurAtIntent: intentIter needs to be stepped once so that it looks like forward iteration + // when the interleaving iter is at a version. if isCurAtIntent { - // iter precedes the intentIter, so must be at the lowest version of the + // iter precedes the intentKey, so must be at the lowest version of the // preceding key or exhausted. So step it forward. It will now point to - // a key that is the same as the intent key since an intent always has a + // a key that is the same as the intentKey since an intent always has a // corresponding provisional value, and provisional values must have a // higher timestamp than any committed value on a key. Note that the // code below does not specifically care if a bug (external to this @@ -526,24 +812,39 @@ func (i *intentInterleavingIter) Next() { return } } + i.makeUpperLimitKey() + var iterState pebble.IterValidityState + var err error + for j := 0; j < i.numIntents; j++ { + iterState, err = i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) + } + if err := i.tryDecodeLockKey(iterState, err); err != nil { + return + } + // At the last intent for that key. + if i.intentIterKey == nil { + i.err = errors.Errorf("intent not found") + i.valid = false + return + } + // One past the intent since we have consumed all of them to construct + // i.mvccMetadata. + iterState, err = i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) + if err := i.tryDecodeLockKey(iterState, err); err != nil { + return + } } else { // The intentIter precedes the iter. It could be for the same key, iff // this key has an intent, or an earlier key. Either way, stepping - // forward will take it to an intent for a later key. Note that iter - // could also be positioned at an intent. We are assuming that there - // isn't a bug (external to this code) that has caused two intents to be - // present for the same key. - var limitKey roachpb.Key - if !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) + // forward will take it to an intent for a later key. + i.makeUpperLimitKey() + iterState, err := i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } i.intentCmp = +1 if util.RaceEnabled && iterState == pebble.IterValid { - cmp := i.intentKey.Compare(i.iterKey.Key) + cmp := i.intentIterKey.Compare(i.iterKey.Key) if cmp <= 0 { i.err = errors.Errorf("intentIter incorrectly positioned, cmp: %d", cmp) i.valid = false @@ -555,12 +856,13 @@ func (i *intentInterleavingIter) Next() { if !i.valid { return } - if i.intentCmp <= 0 { - // The iterator is positioned at an intent in intentIter. iter must be - // positioned at the provisional value. Note that the code below does not - // specifically care if a bug (external to this code) violates the - // invariant that the iter is pointing to the provisional value, but it - // does care that iter is pointing to some version of that key. + if i.isCurAtIntentIter() { + i.mvccMetadata = nil + // iter must be positioned at the provisional value. Note that the code + // below does not specifically care if a bug (external to this code) + // violates the invariant that the iter is pointing to the provisional + // value, but it does care that iter is pointing to some version of that + // key. if i.intentCmp != 0 { i.err = errors.Errorf("intentIter at intent, but iter not at provisional value") i.valid = false @@ -571,17 +873,9 @@ func (i *intentInterleavingIter) Next() { i.valid = false return } - var limitKey roachpb.Key - if !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) - if err = i.tryDecodeLockKey(iterState, err); err != nil { - return - } i.intentCmp = +1 - if util.RaceEnabled && i.intentKey != nil { - cmp := i.intentKey.Compare(i.iterKey.Key) + if util.RaceEnabled && i.intentIterKey != nil { + cmp := i.intentIterKey.Compare(i.iterKey.Key) if cmp <= 0 { i.err = errors.Errorf("intentIter incorrectly positioned, cmp: %d", cmp) i.valid = false @@ -590,22 +884,21 @@ func (i *intentInterleavingIter) Next() { } } else { // Common case: - // The iterator is positioned at iter. It could be a value or an intent, - // though usually it will be a value. + // The iterator is positioned at iter. i.iter.Next() if err := i.tryDecodeKey(); err != nil { return } - if i.intentIterState == pebble.IterAtLimit && i.iterValid && !i.prefix { + if i.intentIterState == pebble.IterAtLimit { // TODO(sumeer): could avoid doing this if i.iter has stepped to // different version of same key. - limitKey := i.makeUpperLimitKey() - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) + i.makeUpperLimitKey() + iterState, err := i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } } - i.computePos() + i.computePos(false) } } @@ -620,73 +913,35 @@ func (i *intentInterleavingIter) NextKey() { if !i.valid { return } - if i.intentCmp <= 0 { - // The iterator is positioned at an intent in intentIter. iter must be - // positioned at the provisional value. + if i.isCurAtIntentIter() { + i.mvccMetadata = nil + // iter must be positioned at the provisional value. if i.intentCmp != 0 { i.err = errors.Errorf("intentIter at intent, but iter not at provisional value") i.valid = false return } - // Step the iter to NextKey(), i.e., past all the versions of this key. - i.iter.NextKey() - if err := i.tryDecodeKey(); err != nil { - return - } - var limitKey roachpb.Key - if i.iterValid && !i.prefix { - limitKey = i.makeUpperLimitKey() - } - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) - if err := i.tryDecodeLockKey(iterState, err); err != nil { - return - } - i.computePos() - return } - // The iterator is positioned at iter. It could be a value or an intent, - // though usually it will be a value. // Step the iter to NextKey(), i.e., past all the versions of this key. i.iter.NextKey() if err := i.tryDecodeKey(); err != nil { return } - if i.intentIterState == pebble.IterAtLimit && i.iterValid && !i.prefix { - limitKey := i.makeUpperLimitKey() - iterState, err := i.intentIter.NextEngineKeyWithLimit(limitKey) - if err = i.tryDecodeLockKey(iterState, err); err != nil { + if i.intentIterState == pebble.IterAtLimit { + i.makeUpperLimitKey() + iterState, err := i.intentIter.NextEngineKeyWithLimit(i.lastLimitKey) + if err := i.tryDecodeLockKey(iterState, err); err != nil { return } } - i.computePos() + i.computePos(false) } func (i *intentInterleavingIter) isCurAtIntentIter() bool { - // When both iter and intentIter are exhausted, the return value is - // immaterial since this function won't be called. We examine the remaining - // cases below. - // - // During forward iteration (dir > 0), we have the following cases: - // - iter is exhausted: intentCmp < 0. This will never happen and callers - // check. Returns true. - // - intentIter is exhausted: intentCmp > 0. Returns false. - // - Neither is exhausted: - // - intentCmp < 0. This will never happen and callers check. Returns true. - // - intentCmp = 0. Returns true. - // - intentCmp > 0. Returns false. - // - // During reverse iteration (dir < 0), we have the following cases: - // - iter is exhausted: intentCmp > 0. Returns true. - // - intentIter is exhausted: intentCmp < 0. Returns false. - // - Neither is exhausted: - // - intentCmp <= 0. Returns false. - // - intentCmp > 0. Returns true. - return (i.dir > 0 && i.intentCmp <= 0) || (i.dir < 0 && i.intentCmp > 0) + return i.mvccMetadata != nil } func (i *intentInterleavingIter) UnsafeKey() MVCCKey { - // If there is a separated intent there cannot also be an interleaved intent - // for the same key. if i.isCurAtIntentIter() { return MVCCKey{Key: i.intentKey} } @@ -694,8 +949,13 @@ func (i *intentInterleavingIter) UnsafeKey() MVCCKey { } func (i *intentInterleavingIter) UnsafeValue() []byte { - if i.isCurAtIntentIter() { - return i.intentIter.UnsafeValue() + if i.mvccMetadata != nil { + // TODO: the interface change will eliminate this wasteful re-serialization. + val, err := protoutil.Marshal(i.mvccMetadata) + if err != nil { + panic(errors.AssertionFailedf("unmarshaled MVCCMetadata is failing marshaling %s", err)) + } + return val } return i.iter.UnsafeValue() } @@ -709,8 +969,13 @@ func (i *intentInterleavingIter) Key() MVCCKey { } func (i *intentInterleavingIter) Value() []byte { - if i.isCurAtIntentIter() { - return i.intentIter.Value() + if i.mvccMetadata != nil { + // TODO: the interface change will eliminate this wasteful re-serialization. + val, err := protoutil.Marshal(i.mvccMetadata) + if err != nil { + panic(errors.AssertionFailedf("unmarshaled MVCCMetadata is failing marshaling %s", err)) + } + return val } return i.iter.Value() } @@ -719,9 +984,9 @@ func (i *intentInterleavingIter) Close() { i.iter.Close() i.intentIter.Close() *i = intentInterleavingIter{ - intentKeyAsNoTimestampMVCCKeyBacking: i.intentKeyAsNoTimestampMVCCKeyBacking, - intentKeyBuf: i.intentKeyBuf, - intentLimitKeyBuf: i.intentLimitKeyBuf, + intentKey: i.intentKey, + intentSeekKeyBuf: i.intentSeekKeyBuf, + intentLimitKeyBuf: i.intentLimitKeyBuf, } intentInterleavingIterPool.Put(i) } @@ -730,6 +995,7 @@ func (i *intentInterleavingIter) SeekLT(key MVCCKey) { i.dir = -1 i.valid = true i.err = nil + i.mvccMetadata = nil if i.prefix { i.err = errors.Errorf("prefix iteration is not permitted with SeekLT") @@ -767,22 +1033,29 @@ func (i *intentInterleavingIter) SeekLT(key MVCCKey) { var intentSeekKey roachpb.Key if key.Timestamp.IsEmpty() { // Common case. - intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key, i.intentKeyBuf) + intentSeekKey, i.intentSeekKeyBuf = keys.LockTableSingleKey(key.Key, i.intentSeekKeyBuf) } else { // Seeking to a specific version, so need to see the intent. Since we need // to see the intent for key.Key, and we don't have SeekLE, call Next() on // the key before doing SeekLT. - intentSeekKey, i.intentKeyBuf = keys.LockTableSingleKey(key.Key.Next(), i.intentKeyBuf) - } - var limitKey roachpb.Key - if i.iterValid { - limitKey = i.makeLowerLimitKey() + // TODO: we will need to bar this case if it misses the latest version, + // since we cannot initialize i.largestVersionSeen which is needed to + // reduce the unknown disposition intent population to at most 1. It is + // also not clear why this would be beneficial to a caller. The only + // current use of SeekLT with a non-zero timestamp is in + // MVCCGarbageCollect, where the caller wants to find the version that is + // the one step younger than the GC key, and has already forward iterated + // over many versions -- so in this case this SeekLT will not miss the + // latest version. As a hack we throw a runtime error in this case. + intentSeekKey, i.intentSeekKeyBuf = keys.LockTableSingleKey(key.Key.Next(), i.intentSeekKeyBuf) } - iterState, err := i.intentIter.SeekEngineKeyLTWithLimit(EngineKey{Key: intentSeekKey}, limitKey) + i.makeLowerLimitKey() + iterState, err := i.intentIter.SeekEngineKeyLTWithLimit( + EngineKey{Key: intentSeekKey}, i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } - i.computePos() + i.computePos(true) } func (i *intentInterleavingIter) Prev() { @@ -800,27 +1073,27 @@ func (i *intentInterleavingIter) Prev() { if err := i.tryDecodeKey(); err != nil { return } - var limitKey roachpb.Key - if i.iterValid { - limitKey = i.makeLowerLimitKey() - } - iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey) + i.makeLowerLimitKey() + iterState, err := i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } - i.computePos() + i.computePos(false) return } // At least one of the iterators is not exhausted. if isCurAtIntent { - // iter is after the intentIter, so must be at the provisional value. - // Step it backward. It will now point to a key that is before the - // intent key. Note that the code below does not specifically care if a + // intentIter has moved beyond the intent to the next roachpb.Key. iter + // must be at the provisional value corresponding to intentKey. + // Step both backward: intentIter must be stepped back numIntents times. + // + // Note that the code below does not specifically care if a // bug (external to this code) violates the invariant that the // provisional value is the highest timestamp key, but it does care that // there is a timestamped value for this key (which it checks below). // The internal invariant of this iterator implementation will ensure // that iter is pointing to the highest timestamped key. + if i.intentCmp != 0 { i.err = errors.Errorf("iter not at provisional value, cmp: %d", i.intentCmp) i.valid = false @@ -839,75 +1112,83 @@ func (i *intentInterleavingIter) Prev() { return } } + i.makeLowerLimitKey() + var iterState pebble.IterValidityState + var err error + for j := 0; j < i.numIntents; j++ { + iterState, err = i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) + } + if err := i.tryDecodeLockKey(iterState, err); err != nil { + return + } + // At the last intent for that key. + if i.intentIterKey == nil { + i.err = errors.Errorf("intent not found") + i.valid = false + return + } + // One before the intent since we have consumed all of them to construct + // i.mvccMetadata. + iterState, err = i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) + if err := i.tryDecodeLockKey(iterState, err); err != nil { + return + } } else { // The intentIter is after the iter. We don't know whether the iter key - // has an intent. Note that the iter could itself be positioned at an - // intent. - limitKey := i.makeLowerLimitKey() - iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey) + // has an intent. Either way, stepping the intentIter back will make it + // look similar to how backwards iteration would be positioned when the + // interleaving iter is at a versioned key. + i.makeLowerLimitKey() + iterState, err := i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } - if i.intentKey == nil { + if i.intentIterKey == nil { i.intentCmp = -1 } else { - i.intentCmp = i.intentKey.Compare(i.iterKey.Key) + i.intentCmp = i.intentIterKey.Compare(i.iterKey.Key) } } } if !i.valid { return } - if i.intentCmp > 0 { - // The iterator is positioned at an intent in intentIter, and iter is + if i.isCurAtIntentIter() { + // intentIter must be positioned preceding the intent, and iter is // exhausted or positioned at a versioned value of a preceding key. - // Stepping intentIter backward will ensure that intentKey is <= the key - // of iter (when neither is exhausted). - var limitKey roachpb.Key - if i.iterValid { - limitKey = i.makeLowerLimitKey() - } - intentIterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey) - if err = i.tryDecodeLockKey(intentIterState, err); err != nil { - return - } - if !i.iterValid { - // It !i.iterValid, the intentIter can no longer be valid either. - // Note that limitKey is nil in this case. - if intentIterState != pebble.IterExhausted { - i.err = errors.Errorf("reverse iteration discovered intent without provisional value") - } - i.valid = false - return - } - // iterValid == true. So positioned at iter. - i.intentCmp = -1 - if i.intentKey != nil { - i.intentCmp = i.intentKey.Compare(i.iterKey.Key) - if i.intentCmp > 0 { - i.err = errors.Errorf("intentIter should not be after iter") - i.valid = false + i.mvccMetadata = nil + if i.intentIterState == pebble.IterAtLimit { + // TODO(sumeer): could avoid doing this if i.iter has stepped to + // different version of same key. + i.makeLowerLimitKey() + iterState, err := i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) + if err = i.tryDecodeLockKey(iterState, err); err != nil { return } } + i.computePos(false) } else { // Common case: - // The iterator is positioned at iter. It could be a value or an intent, - // though usually it will be a value. + // The iterator is positioned at iter, i.e., at a versioned value. + + // There may be intents for this key, though they may be simple-committed, + // since we have not read them and figured out their disposition yet. + // Stash the largestVersionSeen, so that it can be used in the future. + i.largestVersionSeen = i.iterKey.Timestamp i.iter.Prev() if err := i.tryDecodeKey(); err != nil { return } - if i.intentIterState == pebble.IterAtLimit && i.iterValid { + if i.intentIterState == pebble.IterAtLimit { // TODO(sumeer): could avoid doing this if i.iter has stepped to // different version of same key. - limitKey := i.makeLowerLimitKey() - iterState, err := i.intentIter.PrevEngineKeyWithLimit(limitKey) + i.makeLowerLimitKey() + iterState, err := i.intentIter.PrevEngineKeyWithLimit(i.lastLimitKey) if err = i.tryDecodeLockKey(iterState, err); err != nil { return } } - i.computePos() + i.computePos(false) } } @@ -920,18 +1201,6 @@ func (i *intentInterleavingIter) UnsafeRawKey() []byte { func (i *intentInterleavingIter) UnsafeRawMVCCKey() []byte { if i.isCurAtIntentIter() { - if i.intentKeyAsNoTimestampMVCCKey == nil { - // Slow-path: tryDecodeLockKey was not able to initialize. - if cap(i.intentKeyAsNoTimestampMVCCKeyBacking) < len(i.intentKey)+1 { - i.intentKeyAsNoTimestampMVCCKeyBacking = make([]byte, 0, len(i.intentKey)+1) - } - i.intentKeyAsNoTimestampMVCCKeyBacking = append( - i.intentKeyAsNoTimestampMVCCKeyBacking[:0], i.intentKey...) - // Append the 0 byte representing the absence of a timestamp. - i.intentKeyAsNoTimestampMVCCKeyBacking = append( - i.intentKeyAsNoTimestampMVCCKeyBacking, 0) - i.intentKeyAsNoTimestampMVCCKey = i.intentKeyAsNoTimestampMVCCKeyBacking - } return i.intentKeyAsNoTimestampMVCCKey } return i.iter.UnsafeRawKey() @@ -961,7 +1230,7 @@ func (i *intentInterleavingIter) SetUpperBound(key roachpb.Key) { i.checkConstraint(key, true) } var intentUpperBound roachpb.Key - intentUpperBound, i.intentKeyBuf = keys.LockTableSingleKey(key, i.intentKeyBuf) + intentUpperBound, i.intentSeekKeyBuf = keys.LockTableSingleKey(key, i.intentSeekKeyBuf) i.intentIter.SetUpperBound(intentUpperBound) } diff --git a/pkg/storage/intent_interleaving_iter_test.go b/pkg/storage/intent_interleaving_iter_test.go index 6d28cbe95e01..e8e1ae1a1463 100644 --- a/pkg/storage/intent_interleaving_iter_test.go +++ b/pkg/storage/intent_interleaving_iter_test.go @@ -16,6 +16,7 @@ import ( "fmt" "math/rand" "sort" + "strconv" "strings" "testing" @@ -332,7 +333,24 @@ func TestIntentInterleavingIter(t *testing.T) { if d.HasArg("prefix") { d.ScanArgs(t, "prefix", &opts.Prefix) } - iter := wrapInUnsafeIter(newIntentInterleavingIterator(eng, opts)) + var committedTxns *commitMap + if d.HasArg("commit-map") { + var commitTxnsStr string + d.ScanArgs(t, "commit-map", &commitTxnsStr) + txns := strings.Split(commitTxnsStr, ",") + committedTxns = &commitMap{ + simpleCommits: make(map[uuid.UUID]struct{}), + } + for i := range txns { + txn, err := strconv.Atoi(txns[i]) + require.NoError(t, err) + txnUUID := uuid.FromUint128(uint128.FromInts(0, uint64(txn))) + committedTxns.simpleCommits[txnUUID] = struct{}{} + } + } + iiter := newIntentInterleavingIterator(eng, opts) + iiter.(*intentInterleavingIter).commitMap = committedTxns + iter := wrapInUnsafeIter(iiter) var b strings.Builder defer iter.Close() // pos is the original : prefix computed by @@ -724,6 +742,8 @@ func doOps(t *testing.T, ops []string, eng Engine, interleave bool, out *strings } } +// TODO: fix test. + var seedFlag = flag.Int64("seed", -1, "specify seed to use for random number generator") func TestRandomizedIntentInterleavingIter(t *testing.T) { diff --git a/pkg/storage/testdata/intent_interleaving_iter/basic b/pkg/storage/testdata/intent_interleaving_iter/basic index 6f1f3c44f47e..3533201d5962 100644 --- a/pkg/storage/testdata/intent_interleaving_iter/basic +++ b/pkg/storage/testdata/intent_interleaving_iter/basic @@ -1,11 +1,11 @@ -# Both separated and interleaved intents, and one inline meta. +# Separated intents, and one inline meta. define locks meta k=a ts=20 txn=1 +meta k=b ts=30 txn=2 mvcc value k=a ts=20 v=a20 value k=a ts=10 v=a10 -meta k=b ts=30 txn=2 value k=b ts=30 v=b30 meta k=c value k=d ts=25 v=d25 @@ -18,7 +18,7 @@ value k=d ts=25 v=d25 # iterators. # - The second stats call below (after some prev steps) shows a higher value # of interface reverse steps compared to internal reverse steps. This is -# because the intent iterator is being called with PrevWithLimit ans is +# because the intent iterator is being called with PrevWithLimit and is # already at the separated intent "a", so does not have to step the internal # iterator. iter lower=a upper=f @@ -65,7 +65,7 @@ next: output: value k=b ts=30.000000000,0 v=b30 next: output: meta k=c next: output: value k=d ts=25.000000000,0 v=d25 next: output: . -stats: (interface (dir, seek, step): (fwd, 2, 7), (rev, 0, 0)), (internal (dir, seek, step): (fwd, 2, 7), (rev, 0, 0)) +stats: (interface (dir, seek, step): (fwd, 2, 9), (rev, 0, 0)), (internal (dir, seek, step): (fwd, 2, 7), (rev, 0, 0)) prev: output: value k=d ts=25.000000000,0 v=d25 prev: output: meta k=c prev: output: value k=b ts=30.000000000,0 v=b30 @@ -74,17 +74,17 @@ prev: output: value k=a ts=10.000000000,0 v=a10 prev: output: value k=a ts=20.000000000,0 v=a20 prev: output: meta k=a ts=20.000000000,0 txn=1 prev: output: . -stats: (interface (dir, seek, step): (fwd, 2, 7), (rev, 0, 13)), (internal (dir, seek, step): (fwd, 2, 7), (rev, 2, 7)) +stats: (interface (dir, seek, step): (fwd, 2, 9), (rev, 0, 12)), (internal (dir, seek, step): (fwd, 2, 7), (rev, 2, 7)) set-upper c seek-ge "b"/0,0: output: meta k=b ts=30.000000000,0 txn=2 -stats: (interface (dir, seek, step): (fwd, 4, 7), (rev, 0, 13)), (internal (dir, seek, step): (fwd, 4, 7), (rev, 2, 7)) +stats: (interface (dir, seek, step): (fwd, 4, 10), (rev, 0, 12)), (internal (dir, seek, step): (fwd, 4, 8), (rev, 2, 7)) next: output: value k=b ts=30.000000000,0 v=b30 next: output: . prev: output: value k=b ts=30.000000000,0 v=b30 prev: output: meta k=b ts=30.000000000,0 txn=2 prev: output: value k=a ts=10.000000000,0 v=a10 seek-lt "b"/0,0: output: value k=a ts=10.000000000,0 v=a10 -stats: (interface (dir, seek, step): (fwd, 4, 9), (rev, 2, 19)), (internal (dir, seek, step): (fwd, 4, 9), (rev, 6, 13)) +stats: (interface (dir, seek, step): (fwd, 4, 11), (rev, 2, 17)), (internal (dir, seek, step): (fwd, 4, 9), (rev, 6, 13)) next: output: meta k=b ts=30.000000000,0 txn=2 prev: output: value k=a ts=10.000000000,0 v=a10 prev: output: value k=a ts=20.000000000,0 v=a20 @@ -160,6 +160,8 @@ next: output: . # Prefix iteration and NextKey. What we will see after the prefix is # exhausted is undefined. +# seek-ge k=a exhausts the intentIter when extracting the intent, so next or +# next-key do not see the next intent. iter prefix=true seek-ge k=d next-key @@ -172,10 +174,10 @@ next-key seek-ge "d"/0,0: output: value k=d ts=25.000000000,0 v=d25 next-key: output: . seek-ge "a"/0,0: output: meta k=a ts=20.000000000,0 txn=1 -next-key: output: meta k=b ts=30.000000000,0 txn=2 +next-key: output: value k=b ts=30.000000000,0 v=b30 seek-ge "a"/0,0: output: meta k=a ts=20.000000000,0 txn=1 next: output: value k=a ts=20.000000000,0 v=a20 -next-key: output: meta k=b ts=30.000000000,0 txn=2 +next-key: output: value k=b ts=30.000000000,0 v=b30 # Seek to particular timestamp. iter lower=a upper=f @@ -198,6 +200,7 @@ prev prev next seek-lt k=a ts=25 +seek-lt k=a ts=19 prev next seek-ge k=a ts=5 @@ -205,6 +208,7 @@ next next prev seek-lt k=b ts=40 +seek-lt k=b ts=19 prev prev prev @@ -228,18 +232,20 @@ seek-lt "a"/15.000000000,0: output: value k=a ts=20.000000000,0 v=a20 prev: output: meta k=a ts=20.000000000,0 txn=1 prev: output: . next: output: meta k=a ts=20.000000000,0 txn=1 -seek-lt "a"/25.000000000,0: output: meta k=a ts=20.000000000,0 txn=1 -prev: output: . -next: output: meta k=a ts=20.000000000,0 txn=1 +seek-lt "a"/25.000000000,0: output: err: SeekLT cannot be used to skip the latest version and land on intent +seek-lt "a"/19.000000000,0: output: value k=a ts=20.000000000,0 v=a20 +prev: output: meta k=a ts=20.000000000,0 txn=1 +next: output: value k=a ts=20.000000000,0 v=a20 seek-ge "a"/5.000000000,0: output: meta k=b ts=30.000000000,0 txn=2 next: output: value k=b ts=30.000000000,0 v=b30 next: output: meta k=c prev: output: value k=b ts=30.000000000,0 v=b30 -seek-lt "b"/40.000000000,0: output: meta k=b ts=30.000000000,0 txn=2 +seek-lt "b"/40.000000000,0: output: err: SeekLT cannot be used to skip the latest version and land on intent +seek-lt "b"/19.000000000,0: output: value k=b ts=30.000000000,0 v=b30 +prev: output: meta k=b ts=30.000000000,0 txn=2 prev: output: value k=a ts=10.000000000,0 v=a10 prev: output: value k=a ts=20.000000000,0 v=a20 -prev: output: meta k=a ts=20.000000000,0 txn=1 -next: output: value k=a ts=20.000000000,0 v=a20 +next: output: value k=a ts=10.000000000,0 v=a10 # Seek to particular timestamp and prefix iteration. What we will # see after the prefix is exhausted is undefined. @@ -293,17 +299,17 @@ next-key: output: value k=d ts=25.000000000,0 v=d25 next-key: output: . -# Multiple separated intents and multiple interleaved intents. +# Multiple separated intents. define locks +meta k=a ts=10 txn=1 meta k=b ts=20 txn=2 +meta k=c ts=30 txn=3 meta k=d ts=40 txn=4 meta k=e ts=50 txn=5 mvcc -meta k=a ts=10 txn=1 value k=a ts=10 v=a10 value k=b ts=20 v=b20 -meta k=c ts=30 txn=3 value k=c ts=30 v=c30 value k=d ts=40 v=d40 value k=e ts=50 v=e50 @@ -386,24 +392,24 @@ prev seek-lt k=e prev ---- -seek-ge "a"/0,0: output: meta k=b ts=20.000000000,0 txn=2 -next: output: err: intentIter at intent, but iter not at provisional value -seek-lt "e"/0,0: output: meta k=d ts=40.000000000,0 txn=4 -next: output: err: intent has no provisional value -seek-ge "d"/0,0: output: meta k=d ts=40.000000000,0 txn=4 -next-key: output: err: intentIter at intent, but iter not at provisional value -seek-ge "d"/0,0: output: meta k=d ts=40.000000000,0 txn=4 -prev: output: err: iter not at provisional value, cmp: -1 -seek-lt "e"/0,0: output: meta k=d ts=40.000000000,0 txn=4 -prev: output: err: reverse iteration discovered intent without provisional value +seek-ge "a"/0,0: output: err: intent with no version +next: output: err: intent with no version +seek-lt "e"/0,0: output: err: SeekLT cannot be used to skip the latest version and land on intent +next: output: err: SeekLT cannot be used to skip the latest version and land on intent +seek-ge "d"/0,0: output: err: intent with no version +next-key: output: err: intent with no version +seek-ge "d"/0,0: output: err: intent with no version +prev: output: err: intent with no version +seek-lt "e"/0,0: output: err: SeekLT cannot be used to skip the latest version and land on intent +prev: output: err: SeekLT cannot be used to skip the latest version and land on intent # Local range keys. This exercises local keys having separated locks. define locks +meta k=La ts=10 txn=1 meta k=Lb ts=20 txn=2 meta k=Lc ts=30 txn=4 mvcc -meta k=La ts=10 txn=1 value k=La ts=10 v=a10 value k=Lb ts=20 v=b20 value k=Lc ts=30 v=c30 @@ -523,12 +529,7 @@ seek-ge "Lb"/25.000000000,0: output: value k=Lb ts=20.000000000,0 v=b20 next: output: . seek-ge "Lc"/25.000000000,0: output: . -# Keys with \x00 byte. To exercise the slow-path in UnsafeRawMVCCKey. The keys -# that are length 8, 16 will exercise the slow-path. The len(key) < 8 does -# not. DecodeLockTableSingleKey allocates a new slice for all these keys, due -# to the escaping. However the starting capacity is 8, and the next growth -# step is 16, which means that when len(key) < 8, the len of the allocated -# slice is smaller than cap and the first byte beyond len is 0. +# Keys with \x00 byte. define locks meta k=abcdefg\0 ts=20 txn=1 @@ -593,12 +594,12 @@ prev: output: . define locks +meta k=La ts=10 txn=1 meta k=Lb ts=20 txn=2 meta k=Lc ts=30 txn=4 meta k=b ts=40 txn=5 meta k=d ts=50 txn=6 mvcc -meta k=La ts=10 txn=1 value k=La ts=10 v=a10 value k=Lb ts=20 v=b20 value k=Lc ts=30 v=c30 diff --git a/pkg/storage/testdata/intent_interleaving_iter/error_race b/pkg/storage/testdata/intent_interleaving_iter/error_race index ba28e02ded78..bd0771be002e 100644 --- a/pkg/storage/testdata/intent_interleaving_iter/error_race +++ b/pkg/storage/testdata/intent_interleaving_iter/error_race @@ -3,11 +3,12 @@ # when RaceEnabled=true. This file contains the output when # RaceEnabled=true -# Error case: Multiple intents for a key +# Error case: Multiple intents for a key but only one has corresponding +# version. define locks meta k=a ts=10 txn=1 -meta k=b ts=20 txn=2 +meta k=b ts=19 txn=2 meta k=b ts=20 txn=4 meta k=c ts=30 txn=4 mvcc @@ -16,6 +17,8 @@ value k=b ts=20 v=b20 value k=c ts=30 v=c30 ---- +# We don't catch the error on forward iteration, since we forget the simple-committed +# intent. iter lower=a upper=d seek-ge k=a next @@ -32,11 +35,11 @@ prev seek-ge "a"/0,0: output: meta k=a ts=10.000000000,0 txn=1 next: output: value k=a ts=10.000000000,0 v=a10 next: output: meta k=b ts=20.000000000,0 txn=4 -next: output: err: intentIter incorrectly positioned, cmp: 0 -next: output: err: intentIter incorrectly positioned, cmp: 0 -next: output: err: intentIter incorrectly positioned, cmp: 0 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: meta k=c ts=30.000000000,0 txn=4 +next: output: value k=c ts=30.000000000,0 v=c30 seek-lt "d"/0,0: output: value k=c ts=30.000000000,0 v=c30 prev: output: meta k=c ts=30.000000000,0 txn=4 prev: output: value k=b ts=20.000000000,0 v=b20 -prev: output: meta k=b ts=20.000000000,0 txn=2 -prev: output: err: intentIter should not be after iter +prev: output: meta k=b ts=20.000000000,0 txn=4 +prev: output: value k=a ts=10.000000000,0 v=a10 diff --git a/pkg/storage/testdata/intent_interleaving_iter/error_race_off b/pkg/storage/testdata/intent_interleaving_iter/error_race_off index 67a7aa7e4799..7e835c1f964c 100644 --- a/pkg/storage/testdata/intent_interleaving_iter/error_race_off +++ b/pkg/storage/testdata/intent_interleaving_iter/error_race_off @@ -3,11 +3,12 @@ # when RaceEnabled=true. This file contains the output when # RaceEnabled=false -# Error case: Multiple intents for a key +# Error case: Multiple intents for a key but only one has corresponding +# version. define locks meta k=a ts=10 txn=1 -meta k=b ts=20 txn=2 +meta k=b ts=19 txn=2 meta k=b ts=20 txn=4 meta k=c ts=30 txn=4 mvcc @@ -16,10 +17,8 @@ value k=b ts=20 v=b20 value k=c ts=30 v=c30 ---- -# We don't catch the error immediately on forward iteration, since unnecessary -# key comparisons are elided (except for RaceEnabled=true). Which is why the -# intent for k=b is returned twice. But continued forward iteration eventually -# catches it. +# We don't catch the error on forward iteration, since we forget the simple-committed +# intent. iter lower=a upper=d seek-ge k=a next @@ -37,10 +36,12 @@ seek-ge "a"/0,0: output: meta k=a ts=10.000000000,0 txn=1 next: output: value k=a ts=10.000000000,0 v=a10 next: output: meta k=b ts=20.000000000,0 txn=4 next: output: value k=b ts=20.000000000,0 v=b20 -next: output: meta k=b ts=20.000000000,0 txn=2 -next: output: err: intentIter at intent, but iter not at provisional value +next: output: meta k=c ts=30.000000000,0 txn=4 +next: output: value k=c ts=30.000000000,0 v=c30 seek-lt "d"/0,0: output: value k=c ts=30.000000000,0 v=c30 prev: output: meta k=c ts=30.000000000,0 txn=4 prev: output: value k=b ts=20.000000000,0 v=b20 -prev: output: meta k=b ts=20.000000000,0 txn=2 -prev: output: err: intentIter should not be after iter +prev: output: meta k=b ts=20.000000000,0 txn=4 +prev: output: value k=a ts=10.000000000,0 v=a10 + +# TODO: add more tests that differ depending on RaceEnabled value diff --git a/pkg/storage/testdata/intent_interleaving_iter/multi_intents b/pkg/storage/testdata/intent_interleaving_iter/multi_intents new file mode 100644 index 000000000000..09ef0b9683d5 --- /dev/null +++ b/pkg/storage/testdata/intent_interleaving_iter/multi_intents @@ -0,0 +1,256 @@ +define +locks +meta k=a ts=10 txn=1 +meta k=b ts=15 txn=2 +meta k=b ts=25 txn=3 +meta k=b ts=20 txn=4 +meta k=c ts=30 txn=5 +mvcc +value k=a ts=10 v=a10 +value k=b ts=25 v=b25 +value k=b ts=20 v=b20 +value k=b ts=15 v=b15 +value k=c ts=30 v=c30 +---- + +iter lower=a upper=d +seek-ge k=a +next +next +next +next +next +next +next +next +seek-lt k=d +prev +prev +prev +prev +prev +prev +prev +prev +---- +seek-ge "a"/0,0: output: meta k=a ts=10.000000000,0 txn=1 +next: output: value k=a ts=10.000000000,0 v=a10 +next: output: meta k=b ts=25.000000000,0 txn=3 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: meta k=c ts=30.000000000,0 txn=5 +next: output: value k=c ts=30.000000000,0 v=c30 +next: output: . +seek-lt "d"/0,0: output: value k=c ts=30.000000000,0 v=c30 +prev: output: meta k=c ts=30.000000000,0 txn=5 +prev: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=b ts=25.000000000,0 v=b25 +prev: output: meta k=b ts=25.000000000,0 txn=3 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: meta k=a ts=10.000000000,0 txn=1 +prev: output: . + +iter lower=a upper=d +seek-ge k=b +next +next +next +prev +prev +prev +prev +prev +prev +seek-ge k=b +next-key +prev +prev +prev +prev +prev +prev +prev +seek-lt k=b ts=25 +seek-lt k=b ts=24 +next +next +next +seek-lt k=b ts=24 +prev +next +next +next +next +next +---- +seek-ge "b"/0,0: output: meta k=b ts=25.000000000,0 txn=3 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=b ts=25.000000000,0 v=b25 +prev: output: meta k=b ts=25.000000000,0 txn=3 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: meta k=a ts=10.000000000,0 txn=1 +prev: output: . +seek-ge "b"/0,0: output: meta k=b ts=25.000000000,0 txn=3 +next-key: output: meta k=c ts=30.000000000,0 txn=5 +prev: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=b ts=25.000000000,0 v=b25 +prev: output: meta k=b ts=25.000000000,0 txn=3 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: meta k=a ts=10.000000000,0 txn=1 +prev: output: . +seek-lt "b"/25.000000000,0: output: err: SeekLT cannot be used to skip the latest version and land on intent +seek-lt "b"/24.000000000,0: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: meta k=c ts=30.000000000,0 txn=5 +seek-lt "b"/24.000000000,0: output: value k=b ts=25.000000000,0 v=b25 +prev: output: meta k=b ts=25.000000000,0 txn=3 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: meta k=c ts=30.000000000,0 txn=5 +next: output: value k=c ts=30.000000000,0 v=c30 + +# Intents for b are simple-committed since there is a more recent version with +# no intent. +define +locks +meta k=a ts=10 txn=1 +meta k=b ts=15 txn=2 +meta k=b ts=20 txn=4 +meta k=c ts=30 txn=5 +mvcc +value k=a ts=10 v=a10 +value k=b ts=25 v=b25 +value k=b ts=20 v=b20 +value k=b ts=15 v=b15 +value k=c ts=30 v=c30 +---- + +iter lower=a upper=d +seek-ge k=a +next +next +next +next +next +prev +prev +prev +prev +prev +prev +---- +seek-ge "a"/0,0: output: meta k=a ts=10.000000000,0 txn=1 +next: output: value k=a ts=10.000000000,0 v=a10 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: meta k=c ts=30.000000000,0 txn=5 +prev: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=b ts=25.000000000,0 v=b25 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: meta k=a ts=10.000000000,0 txn=1 +prev: output: . + +# Test that if intents vanish we are able to change direction correctly. +iter lower=a upper=d +seek-ge k=a +next +next +prev +prev +prev +seek-lt k=b ts=24 +prev +next +next +next +next +next +---- +seek-ge "a"/0,0: output: meta k=a ts=10.000000000,0 txn=1 +next: output: value k=a ts=10.000000000,0 v=a10 +next: output: value k=b ts=25.000000000,0 v=b25 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: meta k=a ts=10.000000000,0 txn=1 +prev: output: . +seek-lt "b"/24.000000000,0: output: value k=b ts=25.000000000,0 v=b25 +prev: output: value k=a ts=10.000000000,0 v=a10 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: meta k=c ts=30.000000000,0 txn=5 +next: output: value k=c ts=30.000000000,0 v=c30 + +# Test with commitMap +iter lower=a upper=d commit-map=1,5 +seek-ge k=a +next +next +next +next +next +prev +prev +prev +prev +prev +prev +---- +seek-ge "a"/0,0: output: value k=a ts=10.000000000,0 v=a10 +next: output: value k=b ts=25.000000000,0 v=b25 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: value k=c ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=c ts=30.000000000,0 v=c30 +prev: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=b ts=25.000000000,0 v=b25 +prev: output: value k=a ts=10.000000000,0 v=a10 +prev: output: . + +define +locks +meta k=a ts=10 txn=1 +meta k=b ts=15 txn=2 +meta k=b ts=20 txn=4 +meta k=c ts=30 txn=5 +mvcc +value k=a ts=10 v=a10 +value k=b ts=20 v=b20 +value k=b ts=15 v=b15 +value k=c ts=30 v=c30 +---- + +# Latest intent for b is simple-committed because of commit-map and older +# intent for is simple-committed because of being non-youngest version. +# Intents for a and c are also simple-committed because of commit-map. +iter lower=a upper=d commit-map=1,4,5 +seek-ge k=a +next +next +next +next +prev +prev +prev +prev +---- +seek-ge "a"/0,0: output: value k=a ts=10.000000000,0 v=a10 +next: output: value k=b ts=20.000000000,0 v=b20 +next: output: value k=b ts=15.000000000,0 v=b15 +next: output: value k=c ts=30.000000000,0 v=c30 +next: output: . +prev: output: value k=c ts=30.000000000,0 v=c30 +prev: output: value k=b ts=15.000000000,0 v=b15 +prev: output: value k=b ts=20.000000000,0 v=b20 +prev: output: value k=a ts=10.000000000,0 v=a10