From 76029d54c7f124a919343829f67bd9c563224b0e Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Sun, 13 Nov 2022 10:08:25 -0500 Subject: [PATCH] internal/keyspan: modify FragmentIterator seek semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alter the semantics of SeekGE and SeekLT on FragmentIterator. Previously, FragmentIterator's seek operations were defined only in terms of span start keys. This commit changes the seek operations to be defined in terms of the keys contained by the span. A SeekGE now seeks to the first span containing a key ≥ the seek key, and a SeekLT now seeks to the last span containing a key < the seek key. These new semantics match the typical top-level iterator use. With these new semantics, SeekLT can still be implemented in terms of a simple span start key seek. Seeking to the last span containing a key < the seek key is equivalent to seeking to the last span with a start key less than the given key. However, SeekGE implementations now require an extra key comparison and sometimes a Next. Since the top-level iterator requires the containment semantics anyway, this key comparison and Next are only being moved down the stack into the interface implementation. When using the keyspan merging iterator, the keyspan.MergingIter's SeekGE implementation performs a SeekLT per-level, which suffers no additional overhead. With the MergingIter and DefragmentingIter implementations, these new semantics reduce the amount of work performed during a seek. 
The previous iterator stack's SeekGE looked like (left-to-right, top-down): InterleavingIter.SeekGE │ ╭────────────────┴───────────────╮ │ │ DefragmentingIter.SeekLT DefragmentingIter.Next() │ │ ╭───────────┴───╮ │ │ │ │ MergingIter.SeekLT ├── defragmentFwd ├── defragmentFwd │ │ │ │ ╰── defragmentBwd ╰── defragmentFwd ╰───────────╮ │ ╭───────────┴───────────╮ │ │ MergingIter.SeekGE MergingIter.Prev │ ╰─╶╶ per level╶╶ ─╮ │ ╭───────────┴───────────╮ │ │ .SeekLT .Next The new iterator stack's SeekGE, assuming it doesn't hit the new defragmenting fast path, looks like: InterleavingIter.SeekGE │ DefragmentingIter.SeekGE │ ╭────────────────┴───────────────╮ │ ├── defragmentBwd* MergingIter.SeekGE │ │ ╰── defragmentFwd ╰─╶╶ per level╶╶ ─╮ │ │ ├── .SeekLT │ ╰── .Next * — The call to defragmentBackward during SeekGE may now sometimes be elided, specifically if the span discovered by MergingIter.SeekGE does not contain the seek key within its bounds. Note that in this interface, there are no calls to any of the leaf FragmentIterators' SeekGE methods that would suffer the extra key comparison and Next. Instead, the MergingIter calls SeekLT and unconditionally Nexts each of the leaves as a part of its logic to fragment bounds across levels. This reduced work for seeks has a large impact on the MVCCGet and MVCCScan microbenchmarks in the presence of range keys. 
``` name old time/op new time/op delta MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=0-24 6.30µs ± 1% 6.22µs ± 2% ~ (p=0.095 n=5+5) MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=1-24 11.5µs ± 1% 10.3µs ± 1% -9.95% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=100-24 118µs ± 1% 79µs ± 2% -33.14% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=0-24 23.9µs ± 1% 24.1µs ± 2% ~ (p=0.310 n=5+5) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=1-24 31.7µs ± 2% 29.6µs ± 1% -6.65% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=100-24 109µs ± 1% 69µs ± 2% -36.58% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=0-24 100µs ± 1% 99µs ± 3% ~ (p=0.310 n=5+5) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=1-24 110µs ± 1% 106µs ± 2% -3.24% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=100-24 197µs ± 2% 153µs ± 1% -22.75% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=0-24 3.74µs ± 1% 3.57µs ± 1% -4.47% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=1-24 6.01µs ± 1% 4.93µs ± 2% -17.86% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=100-24 66.1µs ± 1% 28.8µs ± 1% -56.35% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=0-24 20.4µs ± 1% 20.4µs ± 1% ~ (p=0.690 n=5+5) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=1-24 25.9µs ± 1% 23.9µs ± 3% -7.79% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=100-24 89.3µs ± 1% 50.2µs ± 2% -43.76% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=0-24 98.7µs ± 1% 97.9µs ± 1% ~ (p=0.151 n=5+5) MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=1-24 106µs ± 1% 103µs ± 1% -2.63% (p=0.008 n=5+5) 
MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=100-24 179µs ± 3% 131µs ± 2% -26.75% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=0-24 10.9µs ± 3% 10.7µs ± 1% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=1-24 17.9µs ± 1% 16.1µs ± 2% -10.35% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=100-24 172µs ± 1% 94µs ± 2% -45.23% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=0-24 13.0µs ± 1% 13.1µs ± 2% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=1-24 21.1µs ± 1% 19.0µs ± 2% -9.70% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=100-24 158µs ± 1% 83µs ± 3% -47.57% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=0-24 20.3µs ± 1% 20.1µs ± 2% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=1-24 30.6µs ± 2% 27.6µs ± 1% -9.70% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=100-24 160µs ± 2% 88µs ± 3% -45.10% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=0-24 41.5µs ± 1% 41.1µs ± 1% -0.97% (p=0.048 n=5+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=1-24 50.9µs ± 1% 48.6µs ± 2% -4.67% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=100-24 140µs ± 2% 94µs ± 1% -32.43% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=0-24 15.6µs ± 2% 16.1µs ± 1% +3.21% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=1-24 24.0µs ± 1% 23.1µs ± 2% -3.87% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=100-24 117µs ± 1% 78µs ± 2% -33.17% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=0-24 20.1µs ± 1% 20.4µs ± 1% +1.30% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=1-24 30.1µs ± 1% 28.5µs ± 1% 
-5.25% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=100-24 109µs ± 2% 70µs ± 1% -36.07% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=0-24 37.7µs ± 2% 38.1µs ± 1% ~ (p=0.056 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=1-24 55.5µs ± 2% 53.9µs ± 1% -2.79% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=100-24 140µs ± 2% 101µs ± 1% -27.99% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=0-24 97.5µs ± 3% 96.1µs ± 2% ~ (p=0.095 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=1-24 117µs ± 4% 115µs ± 1% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=100-24 216µs ± 1% 176µs ± 4% -18.57% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=0-24 50.9µs ± 1% 53.2µs ± 2% +4.37% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=1-24 75.3µs ± 2% 74.8µs ± 2% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=100-24 184µs ± 3% 141µs ± 1% -23.61% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=0-24 68.5µs ± 2% 70.7µs ± 1% +3.27% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=1-24 100µs ± 3% 102µs ± 1% ~ (p=0.222 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=100-24 192µs ± 1% 149µs ± 2% -22.12% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=0-24 172µs ± 0% 176µs ± 1% +2.48% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=1-24 266µs ± 2% 271µs ± 2% +1.88% (p=0.032 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=100-24 404µs ± 3% 364µs ± 3% -9.69% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=0-24 579µs ± 1% 578µs ± 1% ~ (p=0.690 n=5+5) 
MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=1-24 704µs ± 1% 706µs ± 2% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=100-24 965µs ± 2% 923µs ± 3% -4.45% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=0-24 357µs ± 1% 372µs ± 1% +4.34% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=1-24 529µs ± 2% 546µs ± 1% +3.26% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=100-24 757µs ± 1% 691µs ± 1% -8.72% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=0-24 496µs ± 2% 507µs ± 2% ~ (p=0.222 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=1-24 758µs ± 1% 778µs ± 1% +2.63% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=100-24 968µs ± 2% 904µs ± 1% -6.58% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=0-24 1.46ms ± 4% 1.50ms ± 2% ~ (p=0.421 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=1-24 2.36ms ± 2% 2.35ms ± 1% ~ (p=0.841 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=100-24 2.97ms ± 5% 2.91ms ± 2% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=0-24 5.12ms ± 3% 5.08ms ± 3% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=1-24 6.38ms ± 2% 6.34ms ± 2% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=100-24 8.11ms ± 3% 7.97ms ± 5% ~ (p=0.310 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=0-24 3.56ms ± 1% 3.37ms ± 1% -5.61% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=1-24 5.32ms ± 1% 5.12ms ± 2% -3.90% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=100-24 6.35ms ± 1% 6.25ms ± 1% -1.59% (p=0.016 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=0-24 
4.91ms ± 2% 4.90ms ± 1% ~ (p=1.000 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=1-24 7.41ms ± 1% 7.26ms ± 1% -2.10% (p=0.032 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=100-24 8.48ms ± 1% 8.42ms ± 1% ~ (p=0.095 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=0-24 14.3ms ± 3% 14.4ms ± 1% ~ (p=0.310 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=1-24 22.7ms ± 2% 22.6ms ± 2% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=100-24 27.7ms ± 3% 28.0ms ± 3% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=0-24 51.8ms ± 1% 50.4ms ± 5% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=1-24 64.0ms ± 6% 63.0ms ± 4% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=100-24 83.4ms ± 7% 84.3ms ± 4% ~ (p=0.841 n=5+5) name old speed new speed delta MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=0-24 1.27MB/s ± 2% 1.28MB/s ± 2% ~ (p=0.119 n=5+5) MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=1-24 696kB/s ± 1% 774kB/s ± 1% +11.21% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=1/valueSize=8/numRangeKeys=100-24 70.0kB/s ± 0% 100.0kB/s ± 0% +42.86% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=0-24 336kB/s ± 2% 330kB/s ± 0% ~ (p=0.095 n=5+4) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=1-24 250kB/s ± 0% 270kB/s ± 0% +8.00% (p=0.016 n=4+5) MVCCGet_Pebble/batch=false/versions=10/valueSize=8/numRangeKeys=100-24 70.0kB/s ± 0% 114.0kB/s ± 5% +62.86% (p=0.008 n=5+5) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=0-24 80.0kB/s ± 0% 80.0kB/s ± 0% ~ (all equal) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=1-24 70.0kB/s ± 0% 76.0kB/s ± 8% ~ (p=0.167 n=5+5) MVCCGet_Pebble/batch=false/versions=100/valueSize=8/numRangeKeys=100-24 40.0kB/s 
± 0% 50.0kB/s ± 0% +25.00% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=0-24 2.14MB/s ± 1% 2.24MB/s ± 1% +4.68% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=1-24 1.33MB/s ± 1% 1.62MB/s ± 2% +21.77% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=100-24 120kB/s ± 0% 280kB/s ± 0% +133.33% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=0-24 390kB/s ± 0% 390kB/s ± 0% ~ (all equal) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=1-24 310kB/s ± 0% 340kB/s ± 0% +9.68% (p=0.016 n=5+4) MVCCGet_Pebble/batch=true/versions=10/valueSize=8/numRangeKeys=100-24 90.0kB/s ± 0% 160.0kB/s ± 0% +77.78% (p=0.008 n=5+5) MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=0-24 80.0kB/s ± 0% 80.0kB/s ± 0% ~ (all equal) MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=1-24 80.0kB/s ± 0% 80.0kB/s ± 0% ~ (all equal) MVCCGet_Pebble/batch=true/versions=100/valueSize=8/numRangeKeys=100-24 44.0kB/s ±14% 60.0kB/s ± 0% +36.36% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=0-24 5.90MB/s ± 3% 6.00MB/s ± 1% ~ (p=0.119 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=1-24 3.57MB/s ± 1% 3.98MB/s ± 2% +11.53% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=1/valueSize=64/numRangeKeys=100-24 370kB/s ± 0% 678kB/s ± 2% +83.24% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=0-24 4.91MB/s ± 1% 4.90MB/s ± 2% ~ (p=0.730 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=1-24 3.04MB/s ± 1% 3.36MB/s ± 2% +10.73% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=2/valueSize=64/numRangeKeys=100-24 404kB/s ± 1% 772kB/s ± 3% +91.09% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=0-24 3.15MB/s ± 1% 3.19MB/s ± 2% ~ (p=0.167 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=1-24 2.09MB/s ± 2% 2.32MB/s ± 1% +10.70% 
(p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=10/valueSize=64/numRangeKeys=100-24 400kB/s ± 0% 730kB/s ± 3% +82.50% (p=0.016 n=4+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=0-24 1.54MB/s ± 1% 1.56MB/s ± 1% +1.17% (p=0.048 n=5+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=1-24 1.26MB/s ± 1% 1.32MB/s ± 2% +4.93% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1/versions=100/valueSize=64/numRangeKeys=100-24 460kB/s ± 2% 680kB/s ± 0% +47.83% (p=0.016 n=5+4) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=0-24 41.0MB/s ± 2% 39.7MB/s ± 1% -3.13% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=1-24 26.6MB/s ± 1% 27.7MB/s ± 2% +4.03% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=1/valueSize=64/numRangeKeys=100-24 5.46MB/s ± 1% 8.17MB/s ± 2% +49.56% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=0-24 31.8MB/s ± 1% 31.4MB/s ± 1% -1.28% (p=0.024 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=1-24 21.3MB/s ± 1% 22.5MB/s ± 1% +5.52% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=2/valueSize=64/numRangeKeys=100-24 5.85MB/s ± 2% 9.15MB/s ± 1% +56.37% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=0-24 17.0MB/s ± 2% 16.8MB/s ± 1% ~ (p=0.056 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=1-24 11.5MB/s ± 2% 11.9MB/s ± 1% +2.82% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=10/valueSize=64/numRangeKeys=100-24 4.56MB/s ± 2% 6.33MB/s ± 1% +38.89% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=0-24 6.57MB/s ± 3% 6.66MB/s ± 2% ~ (p=0.087 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=1-24 5.47MB/s ± 4% 5.58MB/s ± 1% ~ (p=0.135 n=5+5) MVCCScan_Pebble/rows=10/versions=100/valueSize=64/numRangeKeys=100-24 2.97MB/s ± 1% 3.65MB/s ± 4% +22.98% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=0-24 126MB/s ± 1% 120MB/s ± 2% -4.18% 
(p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=1-24 85.0MB/s ± 3% 85.6MB/s ± 2% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=100/versions=1/valueSize=64/numRangeKeys=100-24 34.7MB/s ± 4% 45.4MB/s ± 1% +30.87% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=0-24 93.4MB/s ± 2% 90.5MB/s ± 1% -3.17% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=1-24 63.8MB/s ± 3% 62.9MB/s ± 1% ~ (p=0.222 n=5+5) MVCCScan_Pebble/rows=100/versions=2/valueSize=64/numRangeKeys=100-24 33.4MB/s ± 1% 42.9MB/s ± 2% +28.42% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=0-24 37.2MB/s ± 0% 36.3MB/s ± 1% -2.42% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=1-24 24.1MB/s ± 2% 23.6MB/s ± 2% -1.84% (p=0.032 n=5+5) MVCCScan_Pebble/rows=100/versions=10/valueSize=64/numRangeKeys=100-24 15.9MB/s ± 3% 17.6MB/s ± 3% +10.73% (p=0.008 n=5+5) MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=0-24 11.1MB/s ± 1% 11.1MB/s ± 1% ~ (p=0.635 n=5+5) MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=1-24 9.09MB/s ± 2% 9.07MB/s ± 2% ~ (p=0.643 n=5+5) MVCCScan_Pebble/rows=100/versions=100/valueSize=64/numRangeKeys=100-24 6.63MB/s ± 2% 6.94MB/s ± 3% +4.68% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=0-24 179MB/s ± 1% 172MB/s ± 1% -4.16% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=1-24 121MB/s ± 2% 117MB/s ± 1% -3.16% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=1/valueSize=64/numRangeKeys=100-24 84.5MB/s ± 1% 92.6MB/s ± 1% +9.56% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=0-24 129MB/s ± 2% 126MB/s ± 3% ~ (p=0.222 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=1-24 84.4MB/s ± 1% 82.3MB/s ± 1% -2.57% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=2/valueSize=64/numRangeKeys=100-24 66.1MB/s ± 2% 70.8MB/s ± 1% 
+7.04% (p=0.008 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=0-24 43.7MB/s ± 4% 42.7MB/s ± 2% ~ (p=0.421 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=1-24 27.2MB/s ± 2% 27.3MB/s ± 1% ~ (p=0.841 n=5+5) MVCCScan_Pebble/rows=1000/versions=10/valueSize=64/numRangeKeys=100-24 21.6MB/s ± 5% 22.0MB/s ± 2% ~ (p=0.135 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=0-24 12.5MB/s ± 3% 12.6MB/s ± 3% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=1-24 10.0MB/s ± 2% 10.1MB/s ± 2% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=1000/versions=100/valueSize=64/numRangeKeys=100-24 7.89MB/s ± 3% 8.04MB/s ± 5% ~ (p=0.310 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=0-24 180MB/s ± 1% 190MB/s ± 1% +5.94% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=1-24 120MB/s ± 1% 125MB/s ± 2% +4.06% (p=0.008 n=5+5) MVCCScan_Pebble/rows=10000/versions=1/valueSize=64/numRangeKeys=100-24 101MB/s ± 1% 102MB/s ± 1% +1.60% (p=0.016 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=0-24 130MB/s ± 2% 131MB/s ± 1% ~ (p=1.000 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=1-24 86.3MB/s ± 1% 88.2MB/s ± 1% +2.14% (p=0.032 n=5+5) MVCCScan_Pebble/rows=10000/versions=2/valueSize=64/numRangeKeys=100-24 75.5MB/s ± 1% 76.0MB/s ± 1% ~ (p=0.095 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=0-24 44.7MB/s ± 3% 44.4MB/s ± 1% ~ (p=0.310 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=1-24 28.2MB/s ± 2% 28.3MB/s ± 2% ~ (p=0.690 n=5+5) MVCCScan_Pebble/rows=10000/versions=10/valueSize=64/numRangeKeys=100-24 23.2MB/s ± 3% 22.9MB/s ± 3% ~ (p=0.548 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=0-24 12.4MB/s ± 1% 12.7MB/s ± 5% ~ (p=0.151 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=1-24 10.0MB/s ± 5% 10.2MB/s ± 5% ~ 
(p=0.643 n=5+5) MVCCScan_Pebble/rows=10000/versions=100/valueSize=64/numRangeKeys=100-24 7.69MB/s ± 7% 7.60MB/s ± 4% ~ (p=0.841 n=5+5) ``` Close #1829. Informs cockroachdb/cockroach#83049. --- internal/keyspan/defragment.go | 58 ++-- internal/keyspan/interleaving_iter.go | 25 +- internal/keyspan/iter.go | 16 +- internal/keyspan/level_iter.go | 4 +- internal/keyspan/merging_iter.go | 285 +++++++++++++++--- internal/keyspan/seek.go | 28 -- internal/keyspan/seek_test.go | 8 +- internal/keyspan/testdata/bounded_iter | 2 +- internal/keyspan/testdata/defragmenting_iter | 26 +- internal/keyspan/testdata/iter | 6 + internal/keyspan/testdata/level_iter | 38 ++- internal/keyspan/testdata/merging_iter | 45 ++- internal/rangekey/coalesce.go | 37 ++- internal/rangekey/testdata/defragmenting_iter | 12 +- internal/rangekey/testdata/iter | 14 +- level_checker.go | 2 +- level_iter_test.go | 2 +- merging_iter.go | 6 +- sstable/block.go | 9 +- 19 files changed, 451 insertions(+), 172 deletions(-) diff --git a/internal/keyspan/defragment.go b/internal/keyspan/defragment.go index 3f534da781..ed0247eb17 100644 --- a/internal/keyspan/defragment.go +++ b/internal/keyspan/defragment.go @@ -119,10 +119,10 @@ const ( // // Seeking (SeekGE, SeekLT) poses an obstacle to defragmentation. A seek may // land on a physical fragment in the middle of several fragments that must be -// defragmented. A seek first degfragments in the opposite direction of -// iteration to find the beginning of the defragmented span, and then -// defragments in the iteration direction, ensuring it's found a whole -// defragmented span. +// defragmented. A seek that lands in a fragment straddling the seek key must +// first degfragment in the opposite direction of iteration to find the +// beginning of the defragmented span, and then defragments in the iteration +// direction, ensuring it's found a whole defragmented span. 
type DefragmentingIter struct { // DefragmentingBuffers holds buffers used for copying iterator state. *DefragmentingBuffers @@ -205,8 +205,9 @@ func (i *DefragmentingIter) Close() error { return i.iter.Close() } -// SeekGE seeks the iterator to the first span with a start key greater than or -// equal to key and returns it. +// SeekGE moves the iterator to the first span covering a key greater than or +// equal to the given key. This is equivalent to seeking to the first span with +// an end key greater than the given key. func (i *DefragmentingIter) SeekGE(key []byte) *Span { i.iterSpan = i.iter.SeekGE(key) if i.iterSpan == nil { @@ -216,30 +217,28 @@ func (i *DefragmentingIter) SeekGE(key []byte) *Span { i.iterPos = iterPosCurr return i.iterSpan } - // Save the current span and peek backwards. - i.saveCurrent() - i.iterSpan = i.iter.Prev() - if i.iterSpan != nil && i.equal(i.curr.Start, i.iterSpan.End) && i.checkEqual(i.iterSpan, &i.curr) { - // A continuation. The span we originally landed on and defragmented - // backwards has a true Start key < key. To obey the FragmentIterator - // contract, we must not return this defragmented span. Defragment - // forward to finish defragmenting the span in the forward direction. - i.defragmentForward() - - // Now we must be on a span that truly has a defragmented Start key > - // key. + // If the span starts strictly after key, we know there mustn't be an + // earlier span that ends at i.iterSpan.Start, otherwise i.iter would've + // returned that span instead. + if i.comparer.Compare(i.iterSpan.Start, key) > 0 { return i.defragmentForward() } - // The span previous to i.curr does not defragment, so we should return it. - // Next the underlying iterator back onto the span we previously saved to - // i.curr and then defragment forward. - i.iterSpan = i.iter.Next() + // The span we landed on has a Start bound ≤ key. There may be additional + // fragments before this span. 
Defragment backward to find the start of the + // defragmented span. + i.defragmentBackward() + if i.iterPos == iterPosPrev { + // Next once back onto the span. + i.iterSpan = i.iter.Next() + } + // Defragment the full span from its start. return i.defragmentForward() } -// SeekLT seeks the iterator to the last span with a start key less than -// key and returns it. +// SeekLT moves the iterator to the last span covering a key less than the +// given key. This is equivalent to seeking to the last span with a start +// key less than the given key. func (i *DefragmentingIter) SeekLT(key []byte) *Span { i.iterSpan = i.iter.SeekLT(key) if i.iterSpan == nil { @@ -249,7 +248,16 @@ func (i *DefragmentingIter) SeekLT(key []byte) *Span { i.iterPos = iterPosCurr return i.iterSpan } - // Defragment forward to find the end of the defragmented span. + // If the span ends strictly before key, we know there mustn't be a later + // span that starts at i.iterSpan.End, otherwise i.iter would've returned + // that span instead. + if i.comparer.Compare(i.iterSpan.End, key) < 0 { + return i.defragmentBackward() + } + + // The span we landed on has a End bound ≥ key. There may be additional + // fragments after this span. Defragment forward to find the end of the + // defragmented span. i.defragmentForward() if i.iterPos == iterPosNext { // Prev once back onto the span. diff --git a/internal/keyspan/interleaving_iter.go b/internal/keyspan/interleaving_iter.go index 1c74c6e430..1be219f57b 100644 --- a/internal/keyspan/interleaving_iter.go +++ b/internal/keyspan/interleaving_iter.go @@ -824,27 +824,16 @@ func (i *InterleavingIter) interleaveBackward() (*base.InternalKey, base.LazyVal } } -// keyspanSeekGE seeks the keyspan iterator to the first span covering k ≥ key. -// Note that this differs from the FragmentIterator.SeekGE semantics, which -// seek to the first span with a start key ≥ key. 
-func (i *InterleavingIter) keyspanSeekGE(key []byte, prefix []byte) { - // Seek using SeekLT to look for a span that starts before key, with an end - // boundary extending beyond key. - i.span = i.keyspanIter.SeekLT(key) - if i.span == nil || i.cmp(i.span.End, key) <= 0 { - // The iterator is exhausted in the reverse direction, or the span we - // found ends before key. Next to the first key with a start ≥ key. - i.span = i.keyspanIter.Next() - } +// keyspanSeekGE seeks the keyspan iterator to the first span covering a key ≥ k. +func (i *InterleavingIter) keyspanSeekGE(k []byte, prefix []byte) { + i.span = i.keyspanIter.SeekGE(k) i.checkForwardBound(prefix) i.savedKeyspan() } -// keyspanSeekLT seeks the keyspan iterator to the last span covering k < key. -// Note that this differs from the FragmentIterator.SeekLT semantics, which -// seek to the last span with a start key < key. -func (i *InterleavingIter) keyspanSeekLT(key []byte) { - i.span = i.keyspanIter.SeekLT(key) +// keyspanSeekLT seeks the keyspan iterator to the last span covering a key < k. +func (i *InterleavingIter) keyspanSeekLT(k []byte) { + i.span = i.keyspanIter.SeekLT(k) i.checkBackwardBound() // The current span's start key is not guaranteed to be less than key, // because of the bounds enforcement. Consider the following example: @@ -857,7 +846,7 @@ func (i *InterleavingIter) keyspanSeekLT(key []byte) { // // This problem is a consequence of the SeekLT's exclusive search key and // the fact that we don't perform bounds truncation at every leaf iterator. 
- if i.span != nil && i.truncated && i.cmp(i.truncatedSpan.Start, key) >= 0 { + if i.span != nil && i.truncated && i.cmp(i.truncatedSpan.Start, k) >= 0 { i.span = nil } i.savedKeyspan() diff --git a/internal/keyspan/iter.go b/internal/keyspan/iter.go index d7f5068bbc..c568a666be 100644 --- a/internal/keyspan/iter.go +++ b/internal/keyspan/iter.go @@ -18,12 +18,14 @@ import ( // longer lifetimes but implementations need only guarantee stability until the // next positioning method. type FragmentIterator interface { - // SeekGE moves the iterator to the first span whose start key is greater - // than or equal to the given key. + // SeekGE moves the iterator to the first span covering a key greater than + // or equal to the given key. This is equivalent to seeking to the first + // span with an end key greater than the given key. SeekGE(key []byte) *Span - // SeekLT moves the iterator to the last span whose start key is less than - // the given key. + // SeekLT moves the iterator to the last span covering a key less than the + // given key. This is equivalent to seeking to the last span with a start + // key less than the given key. SeekLT(key []byte) *Span // First moves the iterator to the first span. @@ -104,6 +106,9 @@ func (i *Iter) Init(cmp base.Compare, spans []Span) { func (i *Iter) SeekGE(key []byte) *Span { // NB: manually inlined sort.Search is ~5% faster. // + // Define f(j) = true iff the span i.spans[j] is strictly before `key` + // (equivalently, i.spans[j].End ≤ key.) + // // Define f(-1) == false and f(n) == true. // Invariant: f(index-1) == false, f(upper) == true. 
i.index = 0 @@ -111,12 +116,13 @@ func (i *Iter) SeekGE(key []byte) *Span { for i.index < upper { h := int(uint(i.index+upper) >> 1) // avoid overflow when computing h // i.index ≤ h < upper - if i.cmp(key, i.spans[h].Start) > 0 { + if i.cmp(key, i.spans[h].End) >= 0 { i.index = h + 1 // preserves f(i-1) == false } else { upper = h // preserves f(j) == true } } + // i.index == upper, f(i.index-1) == false, and f(upper) (= f(i.index)) == // true => answer is i.index. if i.index >= len(i.spans) { diff --git a/internal/keyspan/level_iter.go b/internal/keyspan/level_iter.go index e04133359c..3319897149 100644 --- a/internal/keyspan/level_iter.go +++ b/internal/keyspan/level_iter.go @@ -187,6 +187,7 @@ func (l *LevelIter) SeekGE(key []byte) *Span { f := l.findFileGE(key) if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(key, f.SmallestRangeKey.UserKey) < 0 { prevFile := l.files.Prev() + l.files.Next() if prevFile != nil { // We could unconditionally return an empty span between the seek key and // f.SmallestRangeKey, however if this span is to the left of all range @@ -202,7 +203,6 @@ func (l *LevelIter) SeekGE(key []byte) *Span { // // TODO(bilal): Investigate ways to be able to return straddle spans in // cases similar to the above, while still retaining correctness. - l.files.Next() // Return a straddling key instead of loading the file. 
l.iterFile = f if err := l.Close(); err != nil { @@ -237,6 +237,7 @@ func (l *LevelIter) SeekLT(key []byte) *Span { f := l.findFileLT(key) if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(f.LargestRangeKey.UserKey, key) < 0 { nextFile := l.files.Next() + l.files.Prev() if nextFile != nil { // We could unconditionally return an empty span between f.LargestRangeKey // and the seek key, however if this span is to the right of all range keys @@ -252,7 +253,6 @@ func (l *LevelIter) SeekLT(key []byte) *Span { // // TODO(bilal): Investigate ways to be able to return straddle spans in // cases similar to the above, while still retaining correctness. - l.files.Prev() // Return a straddling key instead of loading the file. l.iterFile = f if err := l.Close(); err != nil { diff --git a/internal/keyspan/merging_iter.go b/internal/keyspan/merging_iter.go index f9921408ae..78b080497c 100644 --- a/internal/keyspan/merging_iter.go +++ b/internal/keyspan/merging_iter.go @@ -21,6 +21,10 @@ import ( // seeks would require introducing key comparisons to switchTo{Min,Max}Heap // where there currently are none. +// TODO(jackson): There are several opportunities to use base.Equal in the +// MergingIter implementation, but will require a bit of plumbing to thread the +// Equal function. + // Transformer defines a transformation to be applied to a Span. type Transformer interface { // Transform takes a Span as input and writes the transformed Span to the @@ -354,38 +358,70 @@ func (m *MergingIter) AddLevel(iter FragmentIterator) { m.levels = append(m.levels, mergingIterLevel{iter: iter}) } -// SeekGE moves the iterator to the first span with a start key greater than or -// equal to key. +// SeekGE moves the iterator to the first span covering a key greater than +// or equal to the given key. This is equivalent to seeking to the first +// span with an end key greater than the given key. 
func (m *MergingIter) SeekGE(key []byte) *Span { m.invalidate() // clear state about current position + + // SeekGE(k) seeks to the first span with an end key greater than the given + // key. The merged span M that we're searching for might straddle the seek + // `key`. In this case, the M.Start may be a key ≤ the seek key. + // + // Consider a SeekGE(dog) in the following example. + // + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // merged: a-b-c-d-e-----h-----k----------p + // + // The merged span M containing 'dog' is [d,e). The 'd' of the merged span + // comes from i0's [b,d)'s end boundary. The [b,d) span does not cover any + // key >= dog, so we cannot find the span by positioning the child iterators + // using a SeekGE(dog). + // + // Instead, if we take all the child iterators' spans bounds: + // a b c d e h k p + // We want to partition them into keys ≤ `key` and keys > `key`. + // dog + // │ + // a b c d│e h k p + // │ + // The largest key on the left of the partition forms the merged span's + // start key, and the smallest key on the right of the partition forms the + // merged span's end key. Recharacterized: + // + // M.Start: the largest boundary ≤ k of any child span + // M.End: the smallest boundary > k of any child span + // + // The FragmentIterator interface doesn't implement seeking by all bounds, + // it implements seeking by containment. A SeekGE(k) will ensure we observe + // all start boundaries ≥ k and all end boundaries > k but does not ensure + // we observe end boundaries = k or any boundaries < k. A SeekLT(k) will + // ensure we observe all start boundaries < k and all end boundaries ≤ k but + // does not ensure we observe any start boundaries = k or any boundaries > + // k. This forces us to seek in one direction and step in the other. 
+ // + // In a SeekGE, we want to end up oriented in the forward direction when + // complete, so we begin with searching for M.Start by SeekLT-ing every + // child iterator to `k`. For every child span found, we determine the + // largest bound ≤ `k` and use it to initialize our max heap. The resulting + // root of the max heap is a preliminary value for `M.Start`. for i := range m.levels { l := &m.levels[i] - - // A SeekGE requires we position each level at the smallest bound ≥ key. - // We must search through both inclusive start and exclusive end bounds. - // Note that this search requirement differs from FragmentIterator's - // .SeekGE'semantics, which returns the span with the smallest start key - // ≥ key. To remedy this difference, we find the last span less than - // key. If its end boundary is greater than or equal to key, we use it. - // Otherwise we use the start boundary of the next span which - // necessarily has a start ≥ key. s := l.iter.SeekLT(key) - if s != nil && m.cmp(s.End, key) >= 0 { - // s.End ≥ key - // We need to use this span's end bound. + if s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else if m.cmp(s.End, key) <= 0 { l.heapKey = boundKey{ kind: boundKindFragmentEnd, key: s.End, span: s, } - continue - } - // s.End < key - // The span `s` ends before key. Next to the first span with a Start ≥ - // key, and use that. - if s = l.iter.Next(); s == nil { - l.heapKey = boundKey{kind: boundKindInvalid} } else { + // s.End > key + // We need to use this span's start bound, since that's the largest + // bound ≤ key. l.heapKey = boundKey{ kind: boundKindFragmentStart, key: s.Start, @@ -393,36 +429,201 @@ func (m *MergingIter) SeekGE(key []byte) *Span { } } } - m.initMinHeap() + m.initMaxHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + // There are no spans covering any key < `key`. There is no span that + // straddles the seek key. 
Reorient the heap into a min heap and return + // the first span we find in the forward direction. + m.switchToMinHeap() + return m.findNextFragmentSet() + } + + // The heap root is now the largest boundary key b such that: + // 1. b < k + // 2. b = k, and b is an end boundary + // There's a third case that we will need to consider later, after we've + // switched to a min heap: + // 3. there exists a start boundary key b such that b = k. + // A start boundary key equal to k would not be surfaced when we seeked all + // the levels using SeekLT(k), since no key < k would be covered by a span + // with an inclusive `k` start boundary. + // + // Assume that the tightest start boundary ≤ k is the current heap root (cases + // 1 & 2). After we switch to a min heap, we'll check for the third case and + // adjust the start boundary if necessary. + m.start = m.heap.items[0].boundKey.key + + // Before switching the direction of the heap, save a copy of the start + // boundary if it's the end boundary of some child span. Next-ing the + // child iterator might switch files and invalidate the memory of the bound. + if m.heap.items[0].boundKey.kind == boundKindFragmentEnd { + m.buf = append(m.buf[:0], m.start...) + m.start = m.buf + } + + // Switch to a min heap. This will move each level to the next bound in + // every level, and then establish a min heap. This allows us to obtain the + // smallest boundary key > `key`, which will serve as our candidate end + // bound. + m.switchToMinHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + return nil + } + + // Check for the case 3 described above. It's possible that when we switch + // heap directions, we discover a start boundary of some child span that is + // equal to the seek key `key`. In this case, we want this key to be our + // start boundary. + if m.heap.items[0].boundKey.kind == boundKindFragmentStart && + m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + // Call findNextFragmentSet, which will set m.start to the heap root and + // proceed forward. + return m.findNextFragmentSet() + } + + m.end = m.heap.items[0].boundKey.key + if found, s := m.synthesizeKeys(+1); found && s != nil { + return s + } return m.findNextFragmentSet() + } -// SeekLT moves the iterator to the last span with a start key less than key. +// SeekLT moves the iterator to the last span covering a key less than the +// given key. This is equivalent to seeking to the last span with a start +// key less than the given key. func (m *MergingIter) SeekLT(key []byte) *Span { - // TODO(jackson): Evaluate whether there's an implementation of SeekLT - // independent of SeekGE that is more efficient. It's tricky, because the - // span we should return might straddle `key` itself.
+ m.invalidate() // clear state about current position + + // SeekLT(k) seeks to the last span with a start key less than the given + // key. The merged span M that we're searching for might straddle the seek + // `key`. In this case, the M.End may be a key ≥ the seek key. + // + // Consider a SeekLT(dog) in the following example. // - // Consider the scenario: - // a----------l #2 - // b-----------m #1 + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // merged: a-b-c-d-e-----h-----k----------p // - // The merged, fully-fragmented spans that MergingIter exposes to the caller - // have bounds: - // a-b #2 - // b--------l #2 - // b--------l #1 - // l-m #1 + // The merged span M containing the largest key < 'dog' is [d,e). The 'e' of + // the merged span comes from i0's [e,h)'s start boundary. The [e,h) span + // does not cover any key < dog, so we cannot find the span by positioning + // the child iterators using a SeekLT(dog). // - // A call SeekLT(c) must return the largest of the above spans with a - // Start user key < key: [b,l)#1. This requires examining bounds both < 'c' - // (the 'b' of [b,m)#1's start key) and bounds ≥ 'c' (the 'l' of ([a,l)#2's - // end key). - if s := m.SeekGE(key); s == nil && m.err != nil { + // Instead, if we take all the child iterators' spans bounds: + // a b c d e h k p + // We want to partition them into keys < `key` and keys ≥ `key`. + // dog + // │ + // a b c d│e h k p + // │ + // The largest key on the left of the partition forms the merged span's + // start key, and the smallest key on the right of the partition forms the + // merged span's end key. Recharacterized: + // + // M.Start: the largest boundary < k of any child span + // M.End: the smallest boundary ≥ k of any child span + // + // The FragmentIterator interface doesn't implement seeking by all bounds, + // it implements seeking by containment.
A SeekGE(k) will ensure we observe + // all start boundaries ≥ k and all end boundaries > k but does not ensure + // we observe end boundaries = k or any boundaries < k. A SeekLT(k) will + // ensure we observe all start boundaries < k and all end boundaries ≤ k but + // does not ensure we observe any start boundaries = k or any boundaries > + // k. This forces us to seek in one direction and step in the other. + // + // In a SeekLT, we want to end up oriented in the backward direction when + // complete, so we begin with searching for M.End by SeekGE-ing every + // child iterator to `k`. For every child span found, we determine the + // smallest bound ≥ `k` and use it to initialize our min heap. The resulting + // root of the min heap is a preliminary value for `M.End`. + for i := range m.levels { + l := &m.levels[i] + s := l.iter.SeekGE(key) + if s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else if m.cmp(s.Start, key) >= 0 { + l.heapKey = boundKey{ + kind: boundKindFragmentStart, + key: s.Start, + span: s, + } + } else { + // s.Start < key + // We need to use this span's end bound, since that's the smallest + // bound ≥ key. + l.heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: s.End, + span: s, + } + } + } + m.initMinHeap() + if m.err != nil { return nil + } else if len(m.heap.items) == 0 { + // There are no spans covering any key ≥ `key`. There is no span that + // straddles the seek key. Reorient the heap into a max heap and return + // the first span we find in the reverse direction. + m.switchToMaxHeap() + return m.findPrevFragmentSet() } - // Prev to the previous span. - return m.Prev() + + // The heap root is now the smallest boundary key b such that: + // 1. b > k + // 2. b = k, and b is a start boundary + // There's a third case that we will need to consider later, after we've + // switched to a max heap: + // 3. there exists an end boundary key b such that b = k. 
+ // An end boundary key equal to k would not be surfaced when we seeked all + // the levels using SeekGE(k), since k would not be contained within the + // exclusive end boundary. + // + // Assume that the tightest end boundary ≥ k is the current heap root (cases + // 1 & 2). After we switch to a max heap, we'll check for the third case and + // adjust the end boundary if necessary. + m.end = m.heap.items[0].boundKey.key + + // Before switching the direction of the heap, save a copy of the end + // boundary if it's the start boundary of some child span. Prev-ing the + // child iterator might switch files and invalidate the memory of the bound. + if m.heap.items[0].boundKey.kind == boundKindFragmentStart { + m.buf = append(m.buf[:0], m.end...) + m.end = m.buf + } + + // Switch to a max heap. This will move each level to the previous bound in + // every level, and then establish a max heap. This allows us to obtain the + // largest boundary key < `key`, which will serve as our candidate start + // bound. + m.switchToMaxHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + return nil + } + // Check for the case 3 described above. It's possible that when we switch + // heap directions, we discover an end boundary of some child span that is + // equal to the seek key `key`. In this case, we want this key to be our end + // boundary. + if m.heap.items[0].boundKey.kind == boundKindFragmentEnd && + m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + // Call findPrevFragmentSet, which will set m.end to the heap root and + // proceed backwards. + return m.findPrevFragmentSet() + } + + m.start = m.heap.items[0].boundKey.key + if found, s := m.synthesizeKeys(-1); found && s != nil { + return s + } + return m.findPrevFragmentSet() } // First seeks the iterator to the first span. 
diff --git a/internal/keyspan/seek.go b/internal/keyspan/seek.go index 59a4884e02..efcf682889 100644 --- a/internal/keyspan/seek.go +++ b/internal/keyspan/seek.go @@ -6,34 +6,6 @@ package keyspan import "github.com/cockroachdb/pebble/internal/base" -// SeekGE seeks to the span that contains the target key or the first span past -// the target key. -func SeekGE(cmp base.Compare, iter FragmentIterator, key []byte) *Span { - // NB: We use SeekLT in order to land on the proper span for a search - // key that resides in the middle of a span. Consider the scenario: - // - // a---e - // e---i - // - // The spans are indexed by their start keys `a` and `e`. If the - // search key is `c` we want to land on the span [a,e). If we were to - // use SeekGE then the search key `c` would land on the span [e,i) and - // we'd have to backtrack. The one complexity here is what happens for the - // search key `e`. In that case SeekLT will land us on the span [a,e) - // and we'll have to move forward. - iterSpan := iter.SeekLT(key) - - // Invariant: key > iterSpan.Start - - if iterSpan == nil || cmp(key, iterSpan.End) >= 0 { - // The current span lies entirely before the search key, or the iterator - // is exhausted. Advance the iterator to the next span which is - // guaranteed to lie at or past the search key. - iterSpan = iter.Next() - } - return iterSpan -} - // SeekLE seeks to the span that contains or is before the target key. 
func SeekLE(cmp base.Compare, iter FragmentIterator, key []byte) *Span { // NB: We use SeekLT in order to land on the proper span for a search diff --git a/internal/keyspan/seek_test.go b/internal/keyspan/seek_test.go index 3acd77e4cc..aa1d64306b 100644 --- a/internal/keyspan/seek_test.go +++ b/internal/keyspan/seek_test.go @@ -32,9 +32,11 @@ func TestSeek(t *testing.T) { iter = NewIter(cmp, spans) return buf.String() case "seek-ge", "seek-le": - seek := SeekGE - if d.Cmd == "seek-le" { - seek = SeekLE + seek := SeekLE + if d.Cmd == "seek-ge" { + seek = func(_ base.Compare, iter FragmentIterator, key []byte) *Span { + return iter.SeekGE(key) + } } for _, line := range strings.Split(d.Input, "\n") { diff --git a/internal/keyspan/testdata/bounded_iter b/internal/keyspan/testdata/bounded_iter index 67dd143f66..8532f620ce 100644 --- a/internal/keyspan/testdata/bounded_iter +++ b/internal/keyspan/testdata/bounded_iter @@ -129,9 +129,9 @@ seek-ge bar prev prev ---- - b-g:{(#4,RANGEKEYSET,@3)} + # Test seeking to a portion of the keyspace that contains a range key with a # start bound < the seek key, and the range key also overlaps the current diff --git a/internal/keyspan/testdata/defragmenting_iter b/internal/keyspan/testdata/defragmenting_iter index d70448dfcd..8f3765b730 100644 --- a/internal/keyspan/testdata/defragmenting_iter +++ b/internal/keyspan/testdata/defragmenting_iter @@ -77,10 +77,10 @@ next seeklt d prev ---- -seekge b f-t:{(#3,RANGEKEYSET,@3,bananas)} -prev a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} -seekge b f-t:{(#3,RANGEKEYSET,@3,bananas)} -next . +seekge b a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +prev . 
+seekge b a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +next f-t:{(#3,RANGEKEYSET,@3,bananas)} seeklt d a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} next f-t:{(#3,RANGEKEYSET,@3,bananas)} seeklt d a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} @@ -111,33 +111,41 @@ prev next next ---- -seekge r t-z:{(#4,RANGEKEYSET,@2,oranges)} -prev f-t:{(#3,RANGEKEYSET,@3,bananas)} +seekge r f-t:{(#3,RANGEKEYSET,@3,bananas)} +prev a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +next f-t:{(#3,RANGEKEYSET,@3,bananas)} next t-z:{(#4,RANGEKEYSET,@2,oranges)} -next . iter seekge f seekge h seekge p seekge t +seekge u +seekge v +seekge z ---- seekge f f-t:{(#3,RANGEKEYSET,@3,bananas)} -seekge h t-z:{(#4,RANGEKEYSET,@2,oranges)} -seekge p t-z:{(#4,RANGEKEYSET,@2,oranges)} +seekge h f-t:{(#3,RANGEKEYSET,@3,bananas)} +seekge p f-t:{(#3,RANGEKEYSET,@3,bananas)} seekge t t-z:{(#4,RANGEKEYSET,@2,oranges)} +seekge u t-z:{(#4,RANGEKEYSET,@2,oranges)} +seekge v t-z:{(#4,RANGEKEYSET,@2,oranges)} +seekge z . 
iter seeklt f seeklt h seeklt p seeklt t +seeklt u seeklt z ---- seeklt f a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} seeklt h f-t:{(#3,RANGEKEYSET,@3,bananas)} seeklt p f-t:{(#3,RANGEKEYSET,@3,bananas)} seeklt t f-t:{(#3,RANGEKEYSET,@3,bananas)} +seeklt u t-z:{(#4,RANGEKEYSET,@2,oranges)} seeklt z t-z:{(#4,RANGEKEYSET,@2,oranges)} # Test iteration with a reducer that collects keys across all spans that diff --git a/internal/keyspan/testdata/iter b/internal/keyspan/testdata/iter index 317f5e0bc5..5a1c4511e8 100644 --- a/internal/keyspan/testdata/iter +++ b/internal/keyspan/testdata/iter @@ -8,20 +8,26 @@ iter seek-ge a seek-ge b seek-ge c +seek-ge cat seek-ge d seek-lt a seek-lt b seek-lt c +seek-lt cat seek-lt d +seek-lt e ---- a-b:{(#2,SET) (#1,SET)} b-c:{(#2,SET) (#1,SET)} c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} . . a-b:{(#2,SET) (#1,SET)} b-c:{(#2,SET) (#1,SET)} c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} iter first diff --git a/internal/keyspan/testdata/level_iter b/internal/keyspan/testdata/level_iter index eaa3cb5c19..3919819bc5 100644 --- a/internal/keyspan/testdata/level_iter +++ b/internal/keyspan/testdata/level_iter @@ -11,22 +11,54 @@ file iter seek-ge a +seek-ge apple seek-ge b +seek-ge banana seek-ge c +seek-ge cantalope seek-ge d +seek-ge dragonfruit +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. +. + +iter seek-lt a +seek-lt apple seek-lt b +seek-lt banana seek-lt c +seek-lt cantalope seek-lt d +seek-lt dragonfruit +prev ---- +. 
+a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) -. -. -a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) + +iter +seek-ge a +prev +seek-lt d +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. iter first diff --git a/internal/keyspan/testdata/merging_iter b/internal/keyspan/testdata/merging_iter index fdb67f4005..90fc382807 100644 --- a/internal/keyspan/testdata/merging_iter +++ b/internal/keyspan/testdata/merging_iter @@ -237,36 +237,46 @@ h-k:{(#5,RANGEKEYDEL)} e-h:{(#8,RANGEKEYDEL)} # Test SeekGE. Note that MergingIter's SeekGE implements the FragmentIterator's -# SeekGE semantics. It returns the first fragment with a Start key ≥ the search -# key, NOT the first fragment that covers a key ≥ the search key. +# SeekGE semantics. It returns the first fragment that covers a key ≥ the search +# key. 
iter seek-ge cc ---- -e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} iter seek-ge 1 seek-ge a seek-ge b seek-ge bb +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter seek-ge c seek-ge cc seek-ge e seek-ge f -seek-ge h -seek-ge i ---- -a-b:{(#3,RANGEKEYUNSET,@1)} -a-b:{(#3,RANGEKEYUNSET,@1)} -b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} c-d:{(#10,RANGEKEYSET,@1,apples)} c-d:{(#10,RANGEKEYSET,@1,apples)} e-h:{(#8,RANGEKEYDEL)} e-h:{(#8,RANGEKEYDEL)} + +iter +seek-ge h +seek-ge i +seek-ge k +seek-ge l +---- h-k:{(#5,RANGEKEYDEL)} h-k:{(#5,RANGEKEYDEL)} + # Test SeekLT. Note that MergingIter's SeekLT implements the FragmentIterator's # SeekLT semantics. It returns the first fragment with a Start key < the search @@ -287,6 +297,15 @@ seek-lt aa seek-lt b seek-lt bb seek-lt c +---- + + +a-b:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter seek-lt cc seek-lt d seek-lt dd @@ -297,12 +316,6 @@ seek-lt hh seek-lt k seek-lt z ---- - - -a-b:{(#3,RANGEKEYUNSET,@1)} -a-b:{(#3,RANGEKEYUNSET,@1)} -b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} -b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} c-d:{(#10,RANGEKEYSET,@1,apples)} c-d:{(#10,RANGEKEYSET,@1,apples)} c-d:{(#10,RANGEKEYSET,@1,apples)} @@ -368,8 +381,8 @@ prev ---- x-y:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#1,RANGEKEYDEL)} w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} -y-z:{(#5,RANGEKEYDEL)} x-y:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#1,RANGEKEYDEL)} +w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} define il-qb:{(#10,RANGEKEYDEL)} @@ -390,9 +403,11 @@ next next next next +next seek-ge yz prev ---- +qb-rf:{(#8,RANGEKEYDEL) (#7,RANGEKEYDEL)} rf-sn:{(#8,RANGEKEYDEL) (#7,RANGEKEYDEL) 
(#4,RANGEKEYDEL)} sn-sv:{(#10,RANGEKEYDEL) (#8,RANGEKEYDEL) (#7,RANGEKEYDEL) (#4,RANGEKEYDEL)} sv-wn:{(#10,RANGEKEYDEL) (#4,RANGEKEYDEL)} diff --git a/internal/rangekey/coalesce.go b/internal/rangekey/coalesce.go index 08a60af4df..f641245e30 100644 --- a/internal/rangekey/coalesce.go +++ b/internal/rangekey/coalesce.go @@ -15,7 +15,42 @@ import ( ) // UserIteratorConfig holds state for constructing the range key iterator stack -// for user iteration. +// for user iteration. The range key iterator must merge range key spans across +// the levels of the LSM. This merging is performed by a keyspan.MergingIter +// on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating +// range-key semantics and shadowing, so the spans returned by a MergingIter are +// fully resolved. +// +// The MergingIter is wrapped by a BoundedIter, which elides spans that are +// outside the iterator bounds (or the current prefix's bounds, during prefix +// iteration mode). +// +// To provide determinism during iteration, the BoundedIter is wrapped by a +// DefragmentingIter that defragments abutting spans with identical +// user-observable state. +// +// At the top-level an InterleavingIter interleaves range keys with point keys +// and performs truncation to iterator bounds. +// +// Below is an abbreviated diagram illustrating the mechanics of a SeekGE.
+// +// InterleavingIter.SeekGE +// │ +// DefragmentingIter.SeekGE +// │ +// BoundedIter.SeekGE +// │ +// ╭────────────────┴───────────────╮ +// │ ├── defragmentBwd* +// MergingIter.SeekGE │ +// │ ╰── defragmentFwd +// ╰─╶╶ per level╶╶ ─╮ +// │ +// │ +// ├── .SeekLT +// │ +// ╰── .Next +// type UserIteratorConfig struct { snapshot uint64 comparer *base.Comparer diff --git a/internal/rangekey/testdata/defragmenting_iter b/internal/rangekey/testdata/defragmenting_iter index 276c7ab21f..63f21779a6 100644 --- a/internal/rangekey/testdata/defragmenting_iter +++ b/internal/rangekey/testdata/defragmenting_iter @@ -41,9 +41,9 @@ next seeklt d prev ---- -seekge b . -prev a-t:{(#3,RANGEKEYSET,@3,bananas)} -seekge b . +seekge b a-t:{(#1,RANGEKEYSET,@3,bananas)} +prev . +seekge b a-t:{(#1,RANGEKEYSET,@3,bananas)} next . seeklt d a-t:{(#3,RANGEKEYSET,@3,bananas)} next . @@ -75,7 +75,7 @@ prev next next ---- -seekge r t-z:{(#4,RANGEKEYSET,@2,oranges)} -prev a-t:{(#3,RANGEKEYSET,@3,bananas)} +seekge r a-t:{(#1,RANGEKEYSET,@3,bananas)} +prev . +next a-t:{(#1,RANGEKEYSET,@3,bananas)} next t-z:{(#4,RANGEKEYSET,@2,oranges)} -next . 
diff --git a/internal/rangekey/testdata/iter b/internal/rangekey/testdata/iter index 73ebad8b22..a26f62efe0 100644 --- a/internal/rangekey/testdata/iter +++ b/internal/rangekey/testdata/iter @@ -49,11 +49,11 @@ next next next ---- -e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} c-d:{(#4,RANGEKEYSET,@3,coconut)} e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} h-j:{(#22,RANGEKEYDEL)} -l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} iter seek-ge c @@ -77,7 +77,7 @@ next next next ---- -e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} c-d:{(#4,RANGEKEYSET,@3,coconut)} a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} @@ -109,8 +109,8 @@ seek-ge bat seek-ge c ---- a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} -c-d:{(#4,RANGEKEYSET,@3,coconut)} -c-d:{(#4,RANGEKEYSET,@3,coconut)} +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} c-d:{(#4,RANGEKEYSET,@3,coconut)} iter @@ -118,7 +118,7 @@ seek-ge 1 seek-ge c1 ---- a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL)} -e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} iter seek-ge zoo @@ -131,7 +131,7 @@ seek-ge yeti q-z:{(#14,RANGEKEYSET,@9,mangos)} q-z:{(#14,RANGEKEYSET,@9,mangos)} - +q-z:{(#14,RANGEKEYSET,@9,mangos)} iter seek-ge h diff --git a/level_checker.go b/level_checker.go index 31fcaf0ddc..29dbbe8a8d 100644 --- a/level_checker.go +++ b/level_checker.go @@ -117,7 +117,7 @@ func (m *simpleMergingIter) positionRangeDels() { if l.rangeDelIter == nil { continue } - l.tombstone = keyspan.SeekGE(m.heap.cmp, l.rangeDelIter, item.key.UserKey) + l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey) } } diff --git a/level_iter_test.go b/level_iter_test.go index 90f9e87cbd..ed4fc7a237 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ 
-361,7 +361,7 @@ func (i *levelIterTestIter) rangeDelSeek( if dir < 0 { t = keyspan.SeekLE(i.levelIter.cmp, i.rangeDelIter, key) } else { - t = keyspan.SeekGE(i.levelIter.cmp, i.rangeDelIter, key) + t = i.rangeDelIter.SeekGE(key) } if t != nil { tombstone = t.Visible(1000) diff --git a/merging_iter.go b/merging_iter.go index 9910f20923..f00415e6cb 100644 --- a/merging_iter.go +++ b/merging_iter.go @@ -358,7 +358,7 @@ func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) { if l.rangeDelIter == nil { continue } - l.tombstone = keyspan.SeekGE(m.heap.cmp, l.rangeDelIter, item.key.UserKey) + l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey) } } @@ -673,7 +673,7 @@ func (m *mergingIter) isNextEntryDeleted(item *mergingIterItem) bool { // levelIter in the future cannot contain item.key). Also, it is possible that we // will encounter parts of the range delete that should be ignored -- we handle that // below. - l.tombstone = keyspan.SeekGE(m.heap.cmp, l.rangeDelIter, item.key.UserKey) + l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey) } if l.tombstone == nil { continue @@ -1073,7 +1073,7 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { // so we can have a sstable with bounds [c#8, i#InternalRangeDelSentinel], and the // tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past // this sstable since it realizes the largest key is a InternalRangeDelSentinel. 
- l.tombstone = keyspan.SeekGE(m.heap.cmp, rangeDelIter, key) + l.tombstone = rangeDelIter.SeekGE(key) if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) && (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) { // NB: Based on the comment above l.largestUserKey >= key, and based on the diff --git a/sstable/block.go b/sstable/block.go index 10261533c5..9b7dc90b32 100644 --- a/sstable/block.go +++ b/sstable/block.go @@ -1391,8 +1391,13 @@ func (i *fragmentBlockIter) Prev() *keyspan.Span { // SeekGE implements (keyspan.FragmentIterator).SeekGE. func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span { - i.dir = +1 - return i.gatherForward(i.blockIter.SeekGE(k, base.SeekGEFlags(0))) + if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 { + return s + } + // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span + // doesn't meet the k < s.End comparison, then there's no need for the + // SeekLT to gatherBackward. + return i.Next() } // SeekLT implements (keyspan.FragmentIterator).SeekLT.