From 643c54f30ab0f14b17ebb64a7554d26c432e2266 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 29 Oct 2022 14:25:29 +0000 Subject: [PATCH 1/3] storage: handle errors in `pebbleMVCCScanner.enablePointSynthesis` Release note: None --- pkg/storage/pebble_mvcc_scanner.go | 27 ++++++++++++++++++-------- pkg/storage/point_synthesizing_iter.go | 10 +++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index 0c55a165f189..29934eb07c14 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -447,7 +447,9 @@ func (p *pebbleMVCCScanner) get(ctx context.Context) { // HasPointAndRange() call. if ok, _ := p.parent.Valid(); ok { if _, hasRange := p.parent.HasPointAndRange(); hasRange { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return + } } } @@ -477,7 +479,9 @@ func (p *pebbleMVCCScanner) scan( // HasPointAndRange() call. if ok, _ := p.parent.Valid(); ok { if _, hasRange := p.parent.HasPointAndRange(); hasRange { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return nil, 0, 0, p.err + } } } @@ -1199,8 +1203,8 @@ func (p *pebbleMVCCScanner) updateCurrent() bool { // synthesizes MVCC point tombstones for MVCC range tombstones and never emits // range keys itself. p.parent must be valid. // -// gcassert:inline -func (p *pebbleMVCCScanner) enablePointSynthesis() { +// Returns true if the iterator is valid. +func (p *pebbleMVCCScanner) enablePointSynthesis() bool { if util.RaceEnabled { if ok, _ := p.parent.Valid(); !ok { panic(errors.AssertionFailedf("enablePointSynthesis called with invalid iter")) @@ -1213,13 +1217,18 @@ func (p *pebbleMVCCScanner) enablePointSynthesis() { p.parent.UnsafeKey())) } } - p.pointIter = NewPointSynthesizingIterAtParent(p.parent) - p.parent = p.pointIter + pointIter, err := NewPointSynthesizingIterAtParent(p.parent) + if err != nil { + p.err = err + return false + } if util.RaceEnabled { - if ok, _ := p.parent.Valid(); !ok { + if ok, _ := pointIter.Valid(); !ok { panic(errors.AssertionFailedf("invalid pointSynthesizingIter with valid iter")) } } + p.pointIter, p.parent = pointIter, pointIter + return true } func (p *pebbleMVCCScanner) decodeCurrentMetadata() bool { @@ -1260,7 +1269,9 @@ func (p *pebbleMVCCScanner) iterValid() bool { // Since iterValid() is called after every iterator positioning operation, it // is convenient to check for any range keys and enable point synthesis here. if p.parent.RangeKeyChanged() { - p.enablePointSynthesis() + if !p.enablePointSynthesis() { + return false + } } if util.RaceEnabled && p.pointIter == nil { if _, hasRange := p.parent.HasPointAndRange(); hasRange { diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index f9f8d371d509..dc09ff945742 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -147,10 +147,10 @@ func NewPointSynthesizingIter(parent MVCCIterator) *PointSynthesizingIter { // NewPointSynthesizingIterAtParent creates a new pointSynthesizingIter and // loads the position from the parent iterator. -func NewPointSynthesizingIterAtParent(parent MVCCIterator) *PointSynthesizingIter { +func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIter, error) { iter := NewPointSynthesizingIter(parent) iter.rangeKeyChanged = true // force range key detection - if ok, err := iter.updateIter(); ok && err == nil { + if ok, _ := iter.updateIter(); ok { // updateSeekGEPosition may step parent and then compare against seekKey, so // we need to clone it. seekKey := parent.UnsafeKey() @@ -158,7 +158,11 @@ func NewPointSynthesizingIterAtParent(parent MVCCIterator) *PointSynthesizingIte seekKey.Key = iter.seekKeyBuf iter.updateSeekGEPosition(seekKey) } - return iter + if iter.iterErr != nil { + iter.release() + return nil, iter.iterErr + } + return iter, nil } // Close implements MVCCIterator. From 48aadd0dc8b8f7c52de0842b1b6c8aa0b42771d7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 31 Oct 2022 17:53:50 +0000 Subject: [PATCH 2/3] storage: fix error handling in `NewPointSynthesizingIterAtParent` Release note: None --- pkg/storage/point_synthesizing_iter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index dc09ff945742..5ba385d7af59 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -158,9 +158,9 @@ func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIt seekKey.Key = iter.seekKeyBuf iter.updateSeekGEPosition(seekKey) } - if iter.iterErr != nil { + if err := iter.iterErr; err != nil { iter.release() - return nil, iter.iterErr + return nil, err } return iter, nil } From ffd14854ce7dcc3905589c792e7737b8555aa17b Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 30 Oct 2022 12:58:31 +0000 Subject: [PATCH 3/3] storage: fix point synthesis activation during reverse scans This patch fixes a bug where `pointSynthesizingIter` could appear to have skipped a synthetic point tombstone in the reverse direction when enabled dynamically by `pebbleMVCCScanner`. Consider the following case: ``` REAL DATASET SYNTHETIC DATASET 3 [b3] 3 [b3] 2 a2 2 a2 1 [---) 1 x a b a b ``` Recall that `pebbleMVCCScanner` only enables `pointSynthesizingIter` when it encounters a range key. In the case above, during a reverse scan, the `[a-b)@1` range key will first become visible to `pebbleMVCCScanner` when it lands on `a@2`, so it enabled point synthesis positioned at the `a@2` point key. Notice how the iterator has now skipped over the synthetic point tombstone `a@1`. This is particularly problematic when combined with `pebbleMVCCScanner` peeking, which assumes that following a `iterPeekPrev()` call, an `iterNext()` call can step the parent iterator forward once to get back to the original position. With the above bug, that is no longer true, as it instead lands on the synthetic point tombstone which was skipped during reverse iteration. During intent processing for `b@3`, such an `iterNext()` call is expected to land on the intent's provisional value at `b@3`, but it instead lands on the intent itself at `b@0`. This in turn caused a value checksum or decoding failure, where it was expecting the current key to be `b@3`, but the actual key was `b@0`. This patch fixes the bug by keeping track of the direction of the previous positioning operation in `pebbleMVCCScanner`, and then repositioning the `pointSynthesizingIter` as appropriate during dynamic activation in the reverse direction. This doesn't have a significant effect on performance (in particular, the point get path is never in reverse). Several alternative approaches were attempted but abandoned: * Don't synthesize points at a range key's start bound. While compelling for a range of reasons (complexity, performance, etc) this has two flaws: on a bare range key, the iterator still needs to know which direction to skip in, and we also need to know the timestamps of bare range keys for `FailOnMoreRecent` handling. * Only synthesize a single point at a range key's start bound. This would solve the problem at hand, but has rather strange semantics. Longer-term, it may be better to merge range key logic into `pebbleMVCCScanner` itself -- `pointSynthesizingIter` was intended to reduce complexity, but it's not clear that the overall complexity is actually reduced. Release note: None --- pkg/storage/pebble_mvcc_scanner.go | 14 +- pkg/storage/point_synthesizing_iter.go | 43 ++++- ...ge_tombstone_scans_reverse_skip_regression | 180 ++++++++++++++++++ 3 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index 29934eb07c14..e170f648d630 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -309,6 +309,11 @@ func extractResultKey(repr []byte) roachpb.Key { // by switching to a pointSynthesizingIter. type pebbleMVCCScanner struct { parent MVCCIterator + // parentReverse is true if the previous parent positioning operation was a + // reverse operation (SeekLT or Prev). This is needed to correctly initialize + // pointIter when encountering an MVCC range tombstone in reverse. + // TODO(erikgrinaker): Consider adding MVCCIterator.IsReverse() instead. + parentReverse bool // pointIter is a point synthesizing iterator that wraps and replaces parent // when an MVCC range tombstone is encountered. A separate reference to it is // kept in order to release it back to its pool when the scanner is done. @@ -453,6 +458,7 @@ func (p *pebbleMVCCScanner) get(ctx context.Context) { } } + p.parentReverse = false p.parent.SeekGE(MVCCKey{Key: p.start}) if !p.updateCurrent() { return @@ -963,6 +969,7 @@ func (p *pebbleMVCCScanner) advanceKeyAtEnd() bool { // Iterating to the next key might have caused the iterator to reach the // end of the key space. If that happens, back up to the very last key. p.peeked = false + p.parentReverse = true p.parent.SeekLT(MVCCKey{Key: p.end}) if !p.updateCurrent() { return false @@ -1217,7 +1224,7 @@ func (p *pebbleMVCCScanner) enablePointSynthesis() bool { p.parent.UnsafeKey())) } } - pointIter, err := NewPointSynthesizingIterAtParent(p.parent) + pointIter, err := NewPointSynthesizingIterAtParent(p.parent, p.parentReverse) if err != nil { p.err = err return false @@ -1284,6 +1291,7 @@ func (p *pebbleMVCCScanner) iterValid() bool { // iterSeek seeks to the latest revision of the specified key (or a greater key). func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool { + p.parentReverse = false p.clearPeeked() p.parent.SeekGE(key) return p.updateCurrent() @@ -1291,6 +1299,7 @@ func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool { // iterSeekReverse seeks to the latest revision of the key before the specified key. func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool { + p.parentReverse = true p.clearPeeked() p.parent.SeekLT(key) if !p.updateCurrent() { @@ -1309,6 +1318,7 @@ func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool { // iterNext advances to the next MVCC key. func (p *pebbleMVCCScanner) iterNext() bool { + p.parentReverse = false if p.reverse && p.peeked { // If we have peeked at the previous entry, we need to advance the iterator // twice. @@ -1334,6 +1344,7 @@ func (p *pebbleMVCCScanner) iterNext() bool { // iterPrev advances to the previous MVCC Key. func (p *pebbleMVCCScanner) iterPrev() bool { + p.parentReverse = true if p.peeked { p.peeked = false return p.updateCurrent() @@ -1348,6 +1359,7 @@ func (p *pebbleMVCCScanner) iterPrev() bool { func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool) { if !p.peeked { p.peeked = true + p.parentReverse = true // We need to save a copy of the current iterator key and value and adjust // curRawKey, curKey and curValue to point to this saved data. We use a // single buffer for this purpose: savedBuf. diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index 5ba385d7af59..ed34ca4a68bf 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -147,7 +147,27 @@ func NewPointSynthesizingIter(parent MVCCIterator) *PointSynthesizingIter { // NewPointSynthesizingIterAtParent creates a new pointSynthesizingIter and // loads the position from the parent iterator. -func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIter, error) { +// +// If reverse is true, the point synthesizing iterator will reposition on +// a synthetic point tombstone after the current position if appropriate. +// Consider the following dataset: +// +// 2 a2 b2 +// 1 [---) +// a b +// +// If the caller was positioned on b@2, then called Prev() and landed on a@2, +// where it detected the [a-b)@2 range key and enabled point synthesis, it would +// appear to have skipped over the synthetic point tombstone at a@1. Setting +// reverse=true in this case will position the iterator on the synthetic point +// tombstone a@1. +// +// However, the above assumes that the previous positioning operation was from a +// different key -- if it was e.g. a SeekLT(a@1) then this repositioning will be +// incorrect. The caller must take care to use reverse only when appropriate. +func NewPointSynthesizingIterAtParent( + parent MVCCIterator, reverse bool, +) (*PointSynthesizingIter, error) { iter := NewPointSynthesizingIter(parent) iter.rangeKeyChanged = true // force range key detection if ok, _ := iter.updateIter(); ok { @@ -157,6 +177,27 @@ func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIt iter.seekKeyBuf = append(iter.seekKeyBuf[:0], seekKey.Key...) seekKey.Key = iter.seekKeyBuf iter.updateSeekGEPosition(seekKey) + if err := iter.iterErr; err != nil { + iter.release() + return nil, err + } + + // If we're on the start key of a range key in the reverse direction, we + // must take care not to skip any synthetic point tombstones that the caller + // would have expected to be emitted. This essentially means switching the + // iterator to reverse iteration and repositioning on the oldest version + // (real or synthetic). + if reverse && len(iter.rangeKeys) > 0 && iter.rangeKeysPos.Equal(iter.rangeKeysStart) { + if !iter.atPoint { + if _, err := iter.iterPrev(); err != nil { + iter.release() + return nil, err + } + } + iter.reverse = true + iter.rangeKeysIdx = iter.rangeKeysEnd - 1 + iter.updateAtPoint() + } } if err := iter.iterErr; err != nil { iter.release() diff --git a/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression b/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression new file mode 100644 index 000000000000..f2fcba081bd9 --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/range_tombstone_scans_reverse_skip_regression @@ -0,0 +1,180 @@ +# Regression test for https://github.com/cockroachdb/cockroach/issues/90642. +# +# REAL DATASET SYNTHETIC DATASET +# 3 [b3] 3 [b3] +# 2 a2 2 a2 +# 1 [---) 1 x +# a b a b +# +# Recall that pebbleMVCCScanner only enables pointSynthesizingIter when it +# encounters a range key. In the case above, during a reverse scan, the [a-b)@1 +# range key will first become visible to pebbleMVCCScanner when it lands on a@2, +# so it enabled point synthesis positioned at the a@2 point key. Notice how the +# iterator has now skipped over the synthetic point tombstone a@1. +# +# This is particularly problematic when combined with pebbleMVCCScanner peeking, +# which assumes that following a iterPeekPrev() call, an iterNext() call can +# step the parent iterator forward once to get back to the original position. +# With the above bug, that is no longer true, as it instead lands on the +# synthetic point tombstone which was skipped during reverse iteration. During +# intent processing for b@3, such an iterNext() call is expected to land on the +# intent's provisional value at b@3, but it instead lands on the intent itself +# at b@0. This in turn caused a value checksum or decoding failure, where it was +# expecting the current key to be b@3, but the actual key was b@0. +run ok +del_range_ts k=a end=b ts=1 +put k=a ts=2 v=a2 +with t=A + txn_begin k=b ts=3 + put k=b v=b3 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[1.000000000,0=/] +data: "a"/2.000000000,0 -> /BYTES/a2 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/3.000000000,0 -> /BYTES/b3 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 +scan: "a" -> /BYTES/a2 @2.000000000,0 + +# We also test the same scenario with a double range tombstone, i.e.: +# +# 3 [b3] +# 2 [---) +# 1 [---) +# a b +run ok +txn_remove t=A +clear_range k=a end=z +del_range_ts k=a end=b ts=1 +del_range_ts k=a end=b ts=2 +with t=A + txn_begin k=b ts=3 + put k=b v=b3 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[2.000000000,0=/ 1.000000000,0=/] +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/3.000000000,0 -> /BYTES/b3 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones +---- +scan: "b" -> /BYTES/b3 @3.000000000,0 +scan: "a" -> / @2.000000000,0 + +# And with a point between the range tombstone. We place the intent at the +# lowest timestamp, which is a contrived/unrealistic scenario, and do tombstone +# scans at all timestamps. +# +# 3 [---) +# 3 a3 +# 2 [---) +# 1 [b1] +# a b +run ok +txn_remove t=A +clear_range k=a end=z +del_range_ts k=a end=b ts=2 +put k=a ts=3 v=a3 +del_range_ts k=a end=b ts=4 +with t=A + txn_begin k=b ts=1 + put k=b v=b1 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[4.000000000,0=/ 2.000000000,0=/] +data: "a"/3.000000000,0 -> /BYTES/a3 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/1.000000000,0 -> /BYTES/b1 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=4 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @4.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=3 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> /BYTES/a3 @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=2 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @2.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=1 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +# And a point below them. +# +# 4 [b4] +# 3 [---) +# 2 [---) +# 1 a1 +# a b +run ok +txn_remove t=A +clear_range k=a end=z +put k=a ts=2 v=a2 +del_range_ts k=a end=b ts=3 +del_range_ts k=a end=b ts=4 +with t=A + txn_begin k=b ts=1 + put k=b v=b1 +---- +>> at end: +txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0 +rangekey: {a-b}/[4.000000000,0=/ 3.000000000,0=/] +data: "a"/2.000000000,0 -> /BYTES/a2 +meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "b"/1.000000000,0 -> /BYTES/b1 + +run ok +scan t=A k=a end=z reverse +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=4 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @4.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=3 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> / @3.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=2 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0 +scan: "a" -> /BYTES/a2 @2.000000000,0 + +run ok +scan t=A k=a end=z reverse tombstones ts=1 +---- +scan: "b" -> /BYTES/b1 @1.000000000,0