Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storage: fix point synthesis activation during reverse scans #90953

Merged
merged 2 commits into from
Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pkg/storage/pebble_mvcc_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,11 @@ func extractResultKey(repr []byte) roachpb.Key {
// by switching to a pointSynthesizingIter.
type pebbleMVCCScanner struct {
parent MVCCIterator
// parentReverse is true if the previous parent positioning operation was a
// reverse operation (SeekLT or Prev). This is needed to correctly initialize
// pointIter when encountering an MVCC range tombstone in reverse.
// TODO(erikgrinaker): Consider adding MVCCIterator.IsReverse() instead.
parentReverse bool
// pointIter is a point synthesizing iterator that wraps and replaces parent
// when an MVCC range tombstone is encountered. A separate reference to it is
// kept in order to release it back to its pool when the scanner is done.
Expand Down Expand Up @@ -453,6 +458,7 @@ func (p *pebbleMVCCScanner) get(ctx context.Context) {
}
}

p.parentReverse = false
p.parent.SeekGE(MVCCKey{Key: p.start})
if !p.updateCurrent() {
return
Expand Down Expand Up @@ -963,6 +969,7 @@ func (p *pebbleMVCCScanner) advanceKeyAtEnd() bool {
// Iterating to the next key might have caused the iterator to reach the
// end of the key space. If that happens, back up to the very last key.
p.peeked = false
p.parentReverse = true
p.parent.SeekLT(MVCCKey{Key: p.end})
if !p.updateCurrent() {
return false
Expand Down Expand Up @@ -1217,7 +1224,7 @@ func (p *pebbleMVCCScanner) enablePointSynthesis() bool {
p.parent.UnsafeKey()))
}
}
pointIter, err := NewPointSynthesizingIterAtParent(p.parent)
pointIter, err := NewPointSynthesizingIterAtParent(p.parent, p.parentReverse)
if err != nil {
p.err = err
return false
Expand Down Expand Up @@ -1284,13 +1291,15 @@ func (p *pebbleMVCCScanner) iterValid() bool {

// iterSeek seeks to the latest revision of the specified key (or a greater key).
func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool {
p.parentReverse = false
p.clearPeeked()
p.parent.SeekGE(key)
return p.updateCurrent()
}

// iterSeekReverse seeks to the latest revision of the key before the specified key.
func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool {
p.parentReverse = true
p.clearPeeked()
p.parent.SeekLT(key)
if !p.updateCurrent() {
Expand All @@ -1309,6 +1318,7 @@ func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool {

// iterNext advances to the next MVCC key.
func (p *pebbleMVCCScanner) iterNext() bool {
p.parentReverse = false
if p.reverse && p.peeked {
// If we have peeked at the previous entry, we need to advance the iterator
// twice.
Expand All @@ -1334,6 +1344,7 @@ func (p *pebbleMVCCScanner) iterNext() bool {

// iterPrev advances to the previous MVCC Key.
func (p *pebbleMVCCScanner) iterPrev() bool {
p.parentReverse = true
if p.peeked {
p.peeked = false
return p.updateCurrent()
Expand All @@ -1348,6 +1359,7 @@ func (p *pebbleMVCCScanner) iterPrev() bool {
func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool) {
if !p.peeked {
p.peeked = true
p.parentReverse = true
// We need to save a copy of the current iterator key and value and adjust
// curRawKey, curKey and curValue to point to this saved data. We use a
// single buffer for this purpose: savedBuf.
Expand Down
47 changes: 44 additions & 3 deletions pkg/storage/point_synthesizing_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,27 @@ func NewPointSynthesizingIter(parent MVCCIterator) *PointSynthesizingIter {

// NewPointSynthesizingIterAtParent creates a new pointSynthesizingIter and
// loads the position from the parent iterator.
func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIter, error) {
//
// If reverse is true, the point synthesizing iterator will reposition on
// a synthetic point tombstone after the current position if appropriate.
// Consider the following dataset:
//
// 2 a2 b2
// 1 [---)
// a b
//
// If the caller was positioned on b@2, then called Prev() and landed on a@2,
// where it detected the [a-b)@2 range key and enabled point synthesis, it would
// appear to have skipped over the synthetic point tombstone at a@1. Setting
// reverse=true in this case will position the iterator on the synthetic point
// tombstone a@1.
//
// However, the above assumes that the previous positioning operation was from a
// different key -- if it was e.g. a SeekLT(a@1) then this repositioning will be
// incorrect. The caller must take care to use reverse only when appropriate.
func NewPointSynthesizingIterAtParent(
parent MVCCIterator, reverse bool,
) (*PointSynthesizingIter, error) {
iter := NewPointSynthesizingIter(parent)
iter.rangeKeyChanged = true // force range key detection
if ok, _ := iter.updateIter(); ok {
Expand All @@ -157,10 +177,31 @@ func NewPointSynthesizingIterAtParent(parent MVCCIterator) (*PointSynthesizingIt
iter.seekKeyBuf = append(iter.seekKeyBuf[:0], seekKey.Key...)
seekKey.Key = iter.seekKeyBuf
iter.updateSeekGEPosition(seekKey)
if err := iter.iterErr; err != nil {
iter.release()
return nil, err
}

// If we're on the start key of a range key in the reverse direction, we
// must take care not to skip any synthetic point tombstones that the caller
// would have expected to be emitted. This essentially means switching the
// iterator to reverse iteration and repositioning on the oldest version
// (real or synthetic).
if reverse && len(iter.rangeKeys) > 0 && iter.rangeKeysPos.Equal(iter.rangeKeysStart) {
if !iter.atPoint {
if _, err := iter.iterPrev(); err != nil {
iter.release()
return nil, err
}
}
iter.reverse = true
iter.rangeKeysIdx = iter.rangeKeysEnd - 1
iter.updateAtPoint()
}
}
if iter.iterErr != nil {
if err := iter.iterErr; err != nil {
iter.release()
return nil, iter.iterErr
return nil, err
}
return iter, nil
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# Regression test for https://github.com/cockroachdb/cockroach/issues/90642.
#
# REAL DATASET SYNTHETIC DATASET
# 3 [b3] 3 [b3]
# 2 a2 2 a2
# 1 [---) 1 x
# a b a b
#
# Recall that pebbleMVCCScanner only enables pointSynthesizingIter when it
# encounters a range key. In the case above, during a reverse scan, the [a-b)@1
# range key will first become visible to pebbleMVCCScanner when it lands on a@2,
# so it enabled point synthesis positioned at the a@2 point key. Notice how the
# iterator has now skipped over the synthetic point tombstone a@1.
#
# This is particularly problematic when combined with pebbleMVCCScanner peeking,
# which assumes that following a iterPeekPrev() call, an iterNext() call can
# step the parent iterator forward once to get back to the original position.
# With the above bug, that is no longer true, as it instead lands on the
# synthetic point tombstone which was skipped during reverse iteration. During
# intent processing for b@3, such an iterNext() call is expected to land on the
# intent's provisional value at b@3, but it instead lands on the intent itself
# at b@0. This in turn caused a value checksum or decoding failure, where it was
# expecting the current key to be b@3, but the actual key was b@0.
run ok
del_range_ts k=a end=b ts=1
put k=a ts=2 v=a2
with t=A
txn_begin k=b ts=3
put k=b v=b3
----
>> at end:
txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0
rangekey: {a-b}/[1.000000000,0=/<empty>]
data: "a"/2.000000000,0 -> /BYTES/a2
meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs=<nil> txnDidNotUpdateMeta=true
data: "b"/3.000000000,0 -> /BYTES/b3

run ok
scan t=A k=a end=z reverse
----
scan: "b" -> /BYTES/b3 @3.000000000,0
scan: "a" -> /BYTES/a2 @2.000000000,0

# We also test the same scenario with a double range tombstone, i.e.:
#
# 3 [b3]
# 2 [---)
# 1 [---)
# a b
run ok
txn_remove t=A
clear_range k=a end=z
del_range_ts k=a end=b ts=1
del_range_ts k=a end=b ts=2
with t=A
txn_begin k=b ts=3
put k=b v=b3
----
>> at end:
txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=3.000000000,0 wto=false gul=0,0
rangekey: {a-b}/[2.000000000,0=/<empty> 1.000000000,0=/<empty>]
meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=3.000000000,0 min=0,0 seq=0} ts=3.000000000,0 del=false klen=12 vlen=7 mergeTs=<nil> txnDidNotUpdateMeta=true
data: "b"/3.000000000,0 -> /BYTES/b3

run ok
scan t=A k=a end=z reverse
----
scan: "b" -> /BYTES/b3 @3.000000000,0

run ok
scan t=A k=a end=z reverse tombstones
----
scan: "b" -> /BYTES/b3 @3.000000000,0
scan: "a" -> /<empty> @2.000000000,0

# And with a point between the range tombstone. We place the intent at the
# lowest timestamp, which is a contrived/unrealistic scenario, and do tombstone
# scans at all timestamps.
#
# 3 [---)
# 3 a3
# 2 [---)
# 1 [b1]
# a b
run ok
txn_remove t=A
clear_range k=a end=z
del_range_ts k=a end=b ts=2
put k=a ts=3 v=a3
del_range_ts k=a end=b ts=4
with t=A
txn_begin k=b ts=1
put k=b v=b1
----
>> at end:
txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0
rangekey: {a-b}/[4.000000000,0=/<empty> 2.000000000,0=/<empty>]
data: "a"/3.000000000,0 -> /BYTES/a3
meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs=<nil> txnDidNotUpdateMeta=true
data: "b"/1.000000000,0 -> /BYTES/b1

run ok
scan t=A k=a end=z reverse
----
scan: "b" -> /BYTES/b1 @1.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=4
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /<empty> @4.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=3
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /BYTES/a3 @3.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=2
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /<empty> @2.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=1
----
scan: "b" -> /BYTES/b1 @1.000000000,0

# And a point below them.
#
# 4 [b4]
# 3 [---)
# 2 [---)
# 1 a1
# a b
run ok
txn_remove t=A
clear_range k=a end=z
put k=a ts=2 v=a2
del_range_ts k=a end=b ts=3
del_range_ts k=a end=b ts=4
with t=A
txn_begin k=b ts=1
put k=b v=b1
----
>> at end:
txn: "A" meta={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=1.000000000,0 wto=false gul=0,0
rangekey: {a-b}/[4.000000000,0=/<empty> 3.000000000,0=/<empty>]
data: "a"/2.000000000,0 -> /BYTES/a2
meta: "b"/0,0 -> txn={id=00000000 key="b" pri=0.00000000 epo=0 ts=1.000000000,0 min=0,0 seq=0} ts=1.000000000,0 del=false klen=12 vlen=7 mergeTs=<nil> txnDidNotUpdateMeta=true
data: "b"/1.000000000,0 -> /BYTES/b1

run ok
scan t=A k=a end=z reverse
----
scan: "b" -> /BYTES/b1 @1.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=4
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /<empty> @4.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=3
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /<empty> @3.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=2
----
scan: "b" -> /BYTES/b1 @1.000000000,0
scan: "a" -> /BYTES/a2 @2.000000000,0

run ok
scan t=A k=a end=z reverse tombstones ts=1
----
scan: "b" -> /BYTES/b1 @1.000000000,0