diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index 0c55a165f189..e170f648d630 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -309,6 +309,11 @@ func extractResultKey(repr []byte) roachpb.Key { // by switching to a pointSynthesizingIter. type pebbleMVCCScanner struct { parent MVCCIterator + // parentReverse is true if the previous parent positioning operation was a + // reverse operation (SeekLT or Prev). This is needed to correctly initialize + // pointIter when encountering an MVCC range tombstone in reverse. + // TODO(erikgrinaker): Consider adding MVCCIterator.IsReverse() instead. + parentReverse bool // pointIter is a point synthesizing iterator that wraps and replaces parent // when an MVCC range tombstone is encountered. A separate reference to it is // kept in order to release it back to its pool when the scanner is done. @@ -447,10 +452,13 @@ func (p *pebbleMVCCScanner) get(ctx context.Context) { // HasPointAndRange() call. if ok, _ := p.parent.Valid(); ok { if _, hasRange := p.parent.HasPointAndRange(); hasRange { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return + } } } + p.parentReverse = false p.parent.SeekGE(MVCCKey{Key: p.start}) if !p.updateCurrent() { return @@ -477,7 +485,9 @@ func (p *pebbleMVCCScanner) scan( // HasPointAndRange() call. if ok, _ := p.parent.Valid(); ok { if _, hasRange := p.parent.HasPointAndRange(); hasRange { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return nil, 0, 0, p.err + } } } @@ -959,6 +969,7 @@ func (p *pebbleMVCCScanner) advanceKeyAtEnd() bool { // Iterating to the next key might have caused the iterator to reach the // end of the key space. If that happens, back up to the very last key. p.peeked = false + p.parentReverse = true p.parent.SeekLT(MVCCKey{Key: p.end}) if !p.updateCurrent() { return false @@ -1199,8 +1210,8 @@ func (p *pebbleMVCCScanner) updateCurrent() bool { // synthesizes MVCC point tombstones for MVCC range tombstones and never emits // range keys itself. p.parent must be valid. // -// gcassert:inline -func (p *pebbleMVCCScanner) enablePointSynthesis() { +// Returns true if the iterator is valid. +func (p *pebbleMVCCScanner) enablePointSynthesis() bool { if util.RaceEnabled { if ok, _ := p.parent.Valid(); !ok { panic(errors.AssertionFailedf("enablePointSynthesis called with invalid iter")) @@ -1213,13 +1224,18 @@ func (p *pebbleMVCCScanner) enablePointSynthesis() { p.parent.UnsafeKey())) } } - p.pointIter = NewPointSynthesizingIterAtParent(p.parent) - p.parent = p.pointIter + pointIter, err := NewPointSynthesizingIterAtParent(p.parent, p.parentReverse) + if err != nil { + p.err = err + return false + } if util.RaceEnabled { - if ok, _ := p.parent.Valid(); !ok { + if ok, _ := pointIter.Valid(); !ok { panic(errors.AssertionFailedf("invalid pointSynthesizingIter with valid iter")) } } + p.pointIter, p.parent = pointIter, pointIter + return true } func (p *pebbleMVCCScanner) decodeCurrentMetadata() bool { @@ -1260,7 +1276,9 @@ func (p *pebbleMVCCScanner) iterValid() bool { // Since iterValid() is called after every iterator positioning operation, it // is convenient to check for any range keys and enable point synthesis here. if p.parent.RangeKeyChanged() { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return false + } } if util.RaceEnabled && p.pointIter == nil { if _, hasRange := p.parent.HasPointAndRange(); hasRange { @@ -1273,6 +1291,7 @@ func (p *pebbleMVCCScanner) iterValid() bool { // iterSeek seeks to the latest revision of the specified key (or a greater key). func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool { + p.parentReverse = false p.clearPeeked() p.parent.SeekGE(key) return p.updateCurrent() @@ -1280,6 +1299,7 @@ func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool { // iterSeekReverse seeks to the latest revision of the key before the specified key. func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool { + p.parentReverse = true p.clearPeeked() p.parent.SeekLT(key) if !p.updateCurrent() { @@ -1298,6 +1318,7 @@ func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool { // iterNext advances to the next MVCC key. func (p *pebbleMVCCScanner) iterNext() bool { + p.parentReverse = false if p.reverse && p.peeked { // If we have peeked at the previous entry, we need to advance the iterator // twice. @@ -1323,6 +1344,7 @@ func (p *pebbleMVCCScanner) iterNext() bool { // iterPrev advances to the previous MVCC Key. func (p *pebbleMVCCScanner) iterPrev() bool { + p.parentReverse = true if p.peeked { p.peeked = false return p.updateCurrent() @@ -1337,6 +1359,7 @@ func (p *pebbleMVCCScanner) iterPrev() bool { func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool) { if !p.peeked { p.peeked = true + p.parentReverse = true // We need to save a copy of the current iterator key and value and adjust // curRawKey, curKey and curValue to point to this saved data. We use a // single buffer for this purpose: savedBuf. diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index f9f8d371d509..ed34ca4a68bf 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -147,18 +147,63 @@ func NewPointSynthesizingIter(parent MVCCIterator) *PointSynthesizingIter { // NewPointSynthesizingIterAtParent creates a new pointSynthesizingIter and // loads the position from the parent iterator. -func NewPointSynthesizingIterAtParent(parent MVCCIterator) *PointSynthesizingIter { +// +// If reverse is true, the point synthesizing iterator will reposition on +// a synthetic point tombstone after the current position if appropriate. +// Consider the following dataset: +// +// 2 a2 b2 +// 1 [---) +// a b +// +// If the caller was positioned on b@2, then called Prev() and landed on a@2, +// where it detected the [a-b)@2 range key and enabled point synthesis, it would +// appear to have skipped over the synthetic point tombstone at a@1. Setting +// reverse=true in this case will position the iterator on the synthetic point +// tombstone a@1. +// +// However, the above assumes that the previous positioning operation was from a +// different key -- if it was e.g. a SeekLT(a@1) then this repositioning will be +// incorrect. The caller must take care to use reverse only when appropriate. +func NewPointSynthesizingIterAtParent( + parent MVCCIterator, reverse bool, +) (*PointSynthesizingIter, error) { iter := NewPointSynthesizingIter(parent) iter.rangeKeyChanged = true // force range key detection - if ok, err := iter.updateIter(); ok && err == nil { + if ok, _ := iter.updateIter(); ok { // updateSeekGEPosition may step parent and then compare against seekKey, so // we need to clone it. seekKey := parent.UnsafeKey() iter.seekKeyBuf = append(iter.seekKeyBuf[:0], seekKey.Key...) seekKey.Key = iter.seekKeyBuf iter.updateSeekGEPosition(seekKey) + if err := iter.iterErr; err != nil { + iter.release() + return nil, err + } + + // If we're on the start key of a range key in the reverse direction, we + // must take care not to skip any synthetic point tombstones that the caller + // would have expected to be emitted. This essentially means switching the + // iterator to reverse iteration and repositioning on the oldest version + // (real or synthetic). + if reverse && len(iter.rangeKeys) > 0 && iter.rangeKeysPos.Equal(iter.rangeKeysStart) { + if !iter.atPoint { + if _, err := iter.iterPrev(); err != nil { + iter.release() + return nil, err + } + } + iter.reverse = true + iter.rangeKeysIdx = iter.rangeKeysEnd - 1 + iter.updateAtPoint() + } } - return iter + if err := iter.iterErr; err != nil { + iter.release() + return nil, err + } + return iter, nil } // Close implements MVCCIterator. diff --git a/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression b/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression new file mode 100644 index 000000000000..f2fcba081bd9 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression @@ -0,0 +1,180 @@ +# Regression test for https://github.com/cockroachdb/cockroach/issues/90642. +# +# REAL DATASET SYNTHETIC DATASET +# 3 [b3] 3 [b3] +# 2 a2 2 a2 +# 1 [---) 1 x +# a b a b +# +# Recall that pebbleMVCCScanner only enables pointSynthesizingIter when it +# encounters a range key. In the case above, during a reverse scan, the [a-b)@1 +# range key will first become visible to pebbleMVCCScanner when it lands on a@2, +# so it enabled point synthesis positioned at the a@2 point key. Notice how the +# iterator has now skipped over the synthetic point tombstone a@1. +# +# This is particularly problematic when combined with pebbleMVCCScanner peeking, +# which assumes that following a iterPeekPrev() call, an iterNext() call can +# step the parent iterator forward once to get back to the original position. +# With the above bug, that is no longer true, as it instead lands on the +# synthetic point tombstone which was skipped during reverse iteration. During +# intent processing for b@3, such an iterNext() call is expected to land on the +# intent's provisional value at b@3, but it instead lands on the intent itself +# at b@0. This in turn caused a value checksum or decoding failure, where it was +# expecting the current key to be b@3, but the actual key was b@0. +run ok +del_range_ts k=a end=b ts=1 +put k=a ts=2 v=a2 +with t=A + txn_begin k=b ts=3 + put k=b v=b3 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[1.000000000,0=/] +data: "a"/2.000000000,0 -> /BYTES/a2 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/3.000000000,0 -> /BYTES/b3 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 +scan: "a" -> /BYTES/a2 @2.000000000,0 + +# We also test the same scenario with a double range tombstone, i.e.: +# +# 3 [b3] +# 2 [---) +# 1 [---) +# a b +run ok +txn_remove t=A +clear_range k=a end=z +del_range_ts k=a end=b ts=1 +del_range_ts k=a end=b ts=2 +with t=A + txn_begin k=b ts=3 + put k=b v=b3 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[2.000000000,0=/ 1.000000000,0=/] +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/3.000000000,0 -> /BYTES/b3 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 +scan: "a" -> / @2.000000000,0 + +# And with a point between the range tombstone. We place the intent at the +# lowest timestamp, which is a contrived/unrealistic scenario, and do tombstone +# scans at all timestamps. +# +# 3 [---) +# 3 a3 +# 2 [---) +# 1 [b1] +# a b +run ok +txn_remove t=A +clear_range k=a end=z +del_range_ts k=a end=b ts=2 +put k=a ts=3 v=a3 +del_range_ts k=a end=b ts=4 +with t=A + txn_begin k=b ts=1 + put k=b v=b1 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[4.000000000,0=/ 2.000000000,0=/] +data: "a"/3.000000000,0 -> /BYTES/a3 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/1.000000000,0 -> /BYTES/b1 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=4 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @4.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=3 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> /BYTES/a3 @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=2 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @2.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=1 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +# And a point below them. +# +# 4 [b4] +# 3 [---) +# 2 [---) +# 1 a1 +# a b +run ok +txn_remove t=A +clear_range k=a end=z +put k=a ts=2 v=a2 +del_range_ts k=a end=b ts=3 +del_range_ts k=a end=b ts=4 +with t=A + txn_begin k=b ts=1 + put k=b v=b1 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[4.000000000,0=/ 3.000000000,0=/] +data: "a"/2.000000000,0 -> /BYTES/a2 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/1.000000000,0 -> /BYTES/b1 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=4 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @4.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=3 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=2 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> /BYTES/a2 @2.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=1 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0