Skip to content

Commit

Permalink
storage: use NextPrefix
Browse files Browse the repository at this point in the history
Update pebbleIterator to use the new NextPrefix method exposed by Pebble
Iterators (cockroachdb/pebble#1860). NextPrefix implements semantics
analogous to MVCCIterator.NextKey and is able to make use of low-level
optimizations to avoid much of the work of a full seek. Additionally, in
the case where the very next key is a new MVCC user key, the
pebbleMVCCScanner is able to use NextPrefix to avoid a key copy and a
key comparison.

```
name                                                                      old speed      new speed       delta
MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=0-24          6.20MB/s ± 2%   6.24MB/s ± 1%     ~     (p=0.524 n=5+5)
MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=1-24          3.71MB/s ± 1%   3.90MB/s ± 1%   +5.01%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=100-24         376kB/s ± 2%    670kB/s ± 0%  +78.19%  (p=0.016 n=5+4)
MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=0-24          4.87MB/s ± 0%   5.00MB/s ± 2%   +2.71%  (p=0.032 n=5+5)
MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=1-24          3.08MB/s ± 1%   3.18MB/s ± 1%   +3.38%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=100-24         420kB/s ± 0%    758kB/s ± 2%  +80.48%  (p=0.016 n=4+5)
MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=0-24         3.21MB/s ± 1%   3.27MB/s ± 2%   +1.99%  (p=0.024 n=5+5)
MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=1-24         2.11MB/s ± 2%   2.23MB/s ± 2%   +5.89%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=100-24        410kB/s ± 2%    688kB/s ± 2%  +67.80%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=0-24        1.55MB/s ± 1%   1.65MB/s ± 2%   +6.19%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=1-24        1.26MB/s ± 2%   1.33MB/s ± 3%   +5.87%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=100-24       460kB/s ± 0%    536kB/s ± 3%  +16.52%  (p=0.016 n=4+5)
MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=0-24         41.4MB/s ± 2%   41.2MB/s ± 2%     ~     (p=0.516 n=5+5)
MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=1-24         26.5MB/s ± 2%   27.1MB/s ± 1%   +2.42%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=100-24       5.37MB/s ± 2%   6.17MB/s ± 1%  +14.74%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=0-24         32.7MB/s ± 2%   31.5MB/s ± 3%   -3.51%  (p=0.032 n=5+5)
MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=1-24         21.4MB/s ± 0%   21.6MB/s ± 1%   +1.18%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=100-24       5.88MB/s ± 1%   6.74MB/s ± 0%  +14.51%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=0-24        17.2MB/s ± 2%   18.5MB/s ± 1%   +7.73%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=1-24        11.6MB/s ± 2%   13.2MB/s ± 2%  +13.93%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=100-24      4.56MB/s ± 1%   5.58MB/s ± 1%  +22.36%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=0-24       6.79MB/s ± 3%   7.50MB/s ± 2%  +10.45%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=1-24       5.53MB/s ± 2%   6.31MB/s ± 2%  +13.95%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=100-24     2.94MB/s ± 1%   3.54MB/s ± 1%  +20.27%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=0-24         125MB/s ± 1%    124MB/s ± 1%     ~     (p=0.548 n=5+5)
MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=1-24        84.2MB/s ± 3%   86.1MB/s ± 2%     ~     (p=0.222 n=5+5)
MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=100-24      33.7MB/s ± 3%   38.0MB/s ± 1%  +12.83%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=0-24        92.7MB/s ± 1%   94.5MB/s ± 1%   +1.90%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=1-24        62.5MB/s ± 1%   65.4MB/s ± 1%   +4.75%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=100-24      32.0MB/s ± 1%   36.4MB/s ± 1%  +13.51%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=0-24       36.5MB/s ± 3%   41.3MB/s ± 4%  +13.05%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=1-24       23.7MB/s ± 2%   30.8MB/s ± 2%  +29.93%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=100-24     15.8MB/s ± 1%   21.3MB/s ± 2%  +34.67%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=0-24      11.1MB/s ± 1%   12.6MB/s ± 2%  +13.08%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=1-24      9.06MB/s ± 2%  11.08MB/s ± 2%  +22.29%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=100-24    6.65MB/s ± 1%   8.91MB/s ± 2%  +33.95%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=0-24        175MB/s ± 2%    176MB/s ± 2%     ~     (p=0.421 n=5+5)
MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=1-24        120MB/s ± 1%    124MB/s ± 1%   +3.59%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=100-24     81.3MB/s ± 2%   87.5MB/s ± 1%   +7.62%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=0-24        122MB/s ± 1%    132MB/s ± 2%   +7.88%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=1-24       82.5MB/s ± 2%   90.5MB/s ± 2%   +9.70%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=100-24     65.2MB/s ± 1%   72.0MB/s ± 1%  +10.41%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=0-24      42.9MB/s ± 1%   50.2MB/s ± 2%  +17.02%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=1-24      27.3MB/s ± 1%   36.7MB/s ± 1%  +34.49%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=100-24    21.4MB/s ± 3%   31.4MB/s ± 4%  +46.71%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=0-24     12.5MB/s ± 3%   14.6MB/s ± 5%  +16.55%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=1-24     10.3MB/s ± 4%   13.2MB/s ± 2%  +28.04%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=100-24   8.00MB/s ± 4%  11.47MB/s ± 4%  +43.25%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=0-24       193MB/s ± 2%    197MB/s ± 1%   +2.33%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=1-24       130MB/s ± 2%    133MB/s ± 1%   +2.05%  (p=0.032 n=5+5)
MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=100-24     104MB/s ± 2%    108MB/s ± 1%   +3.41%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=0-24       137MB/s ± 1%    143MB/s ± 2%   +4.14%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=1-24      89.4MB/s ± 2%   95.8MB/s ± 0%   +7.10%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=100-24    77.3MB/s ± 2%   82.3MB/s ± 2%   +6.47%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=0-24     44.9MB/s ± 1%   52.7MB/s ± 3%  +17.30%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=1-24     28.6MB/s ± 2%   39.0MB/s ± 5%  +36.19%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=100-24   22.6MB/s ± 7%   34.5MB/s ± 5%  +52.68%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=0-24    13.2MB/s ± 3%   15.2MB/s ± 4%  +15.51%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=1-24    10.5MB/s ± 4%   13.4MB/s ± 2%  +27.11%  (p=0.008 n=5+5)
MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=100-24  8.13MB/s ± 6%  11.87MB/s ± 3%  +45.98%  (p=0.008 n=5+5)
```

Epic: None
Informs cockroachdb#83049.
Release note: None
  • Loading branch information
jbowens committed Jan 5, 2023
1 parent 71bb699 commit 72432a9
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 62 deletions.
48 changes: 3 additions & 45 deletions pkg/storage/pebble_iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,9 +454,6 @@ func (p *pebbleIterator) NextEngineKeyWithLimit(

// NextKey implements the MVCCIterator interface.
func (p *pebbleIterator) NextKey() {
// Even though NextKey() is not allowed for switching direction by the
// MVCCIterator interface, pebbleIterator works correctly even when
// switching direction. So we set mvccDirIsReverse = false.
if p.mvccDirIsReverse {
// Switching directions.
p.mvccDirIsReverse = false
Expand All @@ -468,49 +465,10 @@ func (p *pebbleIterator) NextKey() {
if valid, err := p.Valid(); err != nil || !valid {
return
}
p.keyBuf = append(p.keyBuf[:0], p.UnsafeKey().Key...)
if !p.iter.Next() {
return
}

// Prefix iterators can't move onto a separate key by definition, so we
// exhaust the iterator. We could just set mvccDone, but that wouldn't
// propagate RangeKeyChanged() correctly.
if p.prefix {
// Seek to the latest possible key for this prefix, exhausting iter.
seekKey := append(p.keyBuf,
append([]byte{0}, EncodeMVCCTimestampSuffix(hlc.MinTimestamp)...)...)
if p.iter.SeekPrefixGE(seekKey) {
// In practice we'll never hit this loop. It's included for completeness.
for p.iter.Next() {
}
}
return
}

// If the Next() call above didn't move to a different key, seek to it.
if p.UnsafeKey().Key.Equal(p.keyBuf) {
// This is equivalent to:
// p.iter.SeekGE(EncodeKey(MVCCKey{p.UnsafeKey().Key.Next(), hlc.Timestamp{}}))
seekKey := append(p.keyBuf, 0, 0)
p.iter.SeekGE(seekKey)
// If there's a range key straddling the seek point (e.g. a-c when seeking
// to b), it will be surfaced first as a bare range key. However, unless it
// started exactly at the seek key then it has already been emitted, so we
// step past it to the next key, which may be either a point key or range
// key starting past the seek key.
//
// NB: We have to be careful to use p.iter methods below, rather than
// pebbleIterator methods, since seekKey is an already-encoded roachpb.Key
// in raw Pebble key form.
if p.iter.Valid() {
if hasPoint, hasRange := p.iter.HasPointAndRange(); !hasPoint && hasRange {
if startKey, _ := p.iter.RangeBounds(); bytes.Compare(startKey, seekKey) < 0 {
p.iter.Next()
}
}
}
}
// NB: If p.prefix, iterators can't move onto a separate key by definition,
// so the below call to NextPrefix will exhaust the iterator.
p.iter.NextPrefix()
}

// UnsafeKey implements the MVCCIterator interface.
Expand Down
27 changes: 10 additions & 17 deletions pkg/storage/pebble_mvcc_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -961,27 +961,20 @@ func (p *pebbleMVCCScanner) getOne(ctx context.Context) (ok, added bool) {

// nextKey advances to the next user key.
func (p *pebbleMVCCScanner) nextKey() bool {
p.keyBuf = append(p.keyBuf[:0], p.curUnsafeKey.Key...)

for i := 0; i < p.itersBeforeSeek; i++ {
if p.reverse && p.peeked {
// If the parent iterator is in reverse because we've peeked, then we
// can step the iterator once to land back onto the current key before
// we fallthrough to call NextKey.
if !p.iterNext() {
return false
}
if !bytes.Equal(p.curUnsafeKey.Key, p.keyBuf) {
p.incrementItersBeforeSeek()
return true
}
// Fallthrough to NextKey.
}

p.decrementItersBeforeSeek()
// We're pointed at a different version of the same key. Fall back to
// seeking to the next key. We append a NUL to account for the "next-key".
// Note that we cannot rely on curUnsafeKey.Key being unchanged even though
// we are at a different version of the same key -- the underlying
// MVCCIterator is free to mutate the backing for p.curUnsafeKey.Key
// arbitrarily. Therefore we use p.keyBuf here which we have handy.
p.keyBuf = append(p.keyBuf, 0)
return p.iterSeek(MVCCKey{Key: p.keyBuf})
p.parent.NextKey()
if !p.iterValid() {
return false
}
return p.updateCurrent()
}

// backwardLatestVersion backs up the iterator to the latest version for the
Expand Down

0 comments on commit 72432a9

Please sign in to comment.