From fdc704526d554e4a4d5cfab58608f01618548b01 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 10 Jan 2022 02:34:35 -0500 Subject: [PATCH] kv: re-enable time-bound iterators for RefreshRange request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #53348. The other two requests mentioned in that issue (`ResolveIntentRange` and `EndTxn`) would no longer benefit from time-bound iterators because, thanks to b5213fd, they no longer scan the MVCC keyspace. Transaction refreshing is a form of an optimistic concurrency control validation phase. Before a transaction can commit, if it will be committing at a timestamp higher than its original timestamp, it issues point and ranged refresh requests to the key spans it had previously read. The refresh requests scan a span of keys and determine whether any new values have been written since the transaction originally read the keys. The use of time-bound iterators is an important optimization for ranged refresh operations because we expect very few new writes between the time that a transaction originally reads and the time that it refreshes. Without this optimization, each refresh was redoing all of a transaction's reads at their original cost. This effectively doubled the cost of reads for transactions that had to refresh (or worse for those that refreshed multiple times). With this optimization, refreshing a span of keys is expected to be significantly cheaper than the original scan over that span of keys, because it can ignore most files in the lower levels of the LSM. RefreshRange requests were originally built to use time-bound iterators. However, this optimization was disabled in 1eb3b2a due to concerns about correctness. Since then, the correctness concerns have been addressed and we have begun using time-bound iterators in a handful of places. This commit re-enables time-bound iterators for `RefreshRange` requests. 
It does so by using `MVCCIncrementalIterator`, which was enhanced to support additional "intent policies" in 87c7f11. This commit uses the "emit" intent policy so that `RefreshRange` will observe all values and all intents in the refresh time window. ---- Microbenchmarks: ``` name old time/op new time/op delta RefreshRange/linear-keys/refresh_window=[95.00,99.00]-10 230ms ± 1% 0ms ± 1% -99.99% (p=0.000 n=9+9) RefreshRange/linear-keys/refresh_window=[75.00,99.00]-10 185ms ± 1% 0ms ± 1% -99.99% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,95.00]-10 185ms ± 1% 0ms ± 1% -99.99% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[50.00,75.00]-10 123ms ± 1% 0ms ± 2% -99.99% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,95.00]-10 123ms ± 1% 0ms ± 2% -99.99% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,99.00]-10 123ms ± 0% 0ms ± 1% -99.99% (p=0.000 n=10+8) RefreshRange/linear-keys/refresh_window=[99.00,99.00]-10 240ms ± 1% 0ms ± 1% -99.96% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[95.00,95.00]-10 237ms ± 1% 0ms ± 2% -99.96% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,75.00]-10 224ms ± 0% 0ms ± 1% -99.95% (p=0.000 n=9+9) RefreshRange/linear-keys/refresh_window=[50.00,50.00]-10 207ms ± 1% 0ms ± 3% -99.95% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,0.00]-10 174ms ± 1% 0ms ± 1% -99.93% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,0.00]-10 189ms ± 1% 0ms ± 1% -99.86% (p=0.000 n=9+9) RefreshRange/mixed-case/refresh_window=[0.00,0.00]-10 184ms ± 0% 0ms ± 0% -99.85% (p=0.000 n=8+9) RefreshRange/mixed-case/refresh_window=[95.00,95.00]-10 252ms ± 0% 1ms ± 2% -99.70% (p=0.000 n=8+10) RefreshRange/random-keys/refresh_window=[0.00,50.00]-10 412µs ± 1% 13µs ± 2% -96.83% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,50.00]-10 413µs ± 1% 13µs ± 1% -96.78% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,75.00]-10 292µs 
± 1% 13µs ± 1% -95.43% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,95.00]-10 245µs ± 1% 13µs ± 2% -94.64% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,95.00]-10 245µs ± 0% 14µs ± 1% -94.49% (p=0.000 n=9+10) RefreshRange/random-keys/refresh_window=[0.00,99.00]-10 238µs ± 1% 13µs ± 1% -94.48% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[95.00,99.00]-10 237µs ± 1% 13µs ± 2% -94.42% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,99.00]-10 237µs ± 2% 14µs ± 1% -94.29% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[50.00,75.00]-10 292µs ± 1% 17µs ± 1% -94.07% (p=0.000 n=10+9) RefreshRange/linear-keys/refresh_window=[0.00,75.00]-10 225µs ± 2% 14µs ± 1% -94.00% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,99.00]-10 224µs ± 1% 13µs ± 1% -93.99% (p=0.000 n=10+8) RefreshRange/linear-keys/refresh_window=[0.00,95.00]-10 224µs ± 1% 14µs ± 1% -93.95% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,95.00]-10 244µs ± 0% 15µs ± 1% -93.86% (p=0.000 n=7+10) RefreshRange/mixed-case/refresh_window=[0.00,99.00]-10 237µs ± 1% 15µs ± 1% -93.82% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,50.00]-10 224µs ± 1% 14µs ± 1% -93.76% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,95.00]-10 244µs ± 1% 15µs ± 1% -93.75% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,99.00]-10 238µs ± 1% 15µs ± 1% -93.72% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[75.00,99.00]-10 236µs ± 1% 15µs ± 2% -93.64% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,99.00]-10 236µs ± 1% 15µs ± 1% -93.63% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[50.00,95.00]-10 244µs ± 0% 16µs ± 1% -93.58% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[75.00,95.00]-10 244µs ± 1% 16µs ± 0% -93.55% (p=0.000 n=10+8) RefreshRange/random-keys/refresh_window=[0.00,75.00]-10 287µs ± 1% 19µs ± 1% -93.20% (p=0.000 n=10+10) 
RefreshRange/random-keys/refresh_window=[95.00,99.00]-10 237µs ± 1% 17µs ± 1% -92.69% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,75.00]-10 288µs ± 2% 23µs ± 1% -91.98% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[99.00,99.00]-10 255ms ± 1% 122ms ± 1% -52.02% (p=0.000 n=9+9) RefreshRange/random-keys/refresh_window=[75.00,75.00]-10 242ms ± 1% 152ms ± 1% -37.02% (p=0.000 n=10+9) RefreshRange/random-keys/refresh_window=[99.00,99.00]-10 259ms ± 0% 354ms ± 1% +36.73% (p=0.000 n=7+9) RefreshRange/random-keys/refresh_window=[95.00,95.00]-10 256ms ± 1% 353ms ± 1% +37.65% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[75.00,75.00]-10 242ms ± 0% 398ms ± 1% +64.38% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[50.00,50.00]-10 227ms ± 0% 392ms ± 1% +72.65% (p=0.000 n=9+10) RefreshRange/random-keys/refresh_window=[50.00,50.00]-10 229ms ± 1% 512ms ± 1% +123.45% (p=0.000 n=9+9) name old alloc/op new alloc/op delta RefreshRange/linear-keys/refresh_window=[99.00,99.00]-10 195MB ± 0% 0MB ± 0% -100.00% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[95.00,95.00]-10 188MB ± 0% 0MB ± 0% -100.00% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[95.00,95.00]-10 188MB ± 0% 0MB ± 1% -100.00% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,75.00]-10 148MB ± 0% 0MB ± 0% -100.00% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,50.00]-10 98.8MB ± 0% 0.0MB ± 0% -100.00% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[99.00,99.00]-10 195MB ± 0% 0MB ± 3% -100.00% (p=0.000 n=9+8) RefreshRange/linear-keys/refresh_window=[95.00,99.00]-10 188MB ± 0% 0MB ± 0% -100.00% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,95.00]-10 148MB ± 0% 0MB ± 0% -99.99% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,99.00]-10 148MB ± 0% 0MB ± 0% -99.99% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[50.00,75.00]-10 99.0MB ± 0% 0.0MB ± 0% -99.99% (p=0.000 
n=9+10) RefreshRange/linear-keys/refresh_window=[50.00,95.00]-10 99.0MB ± 0% 0.0MB ± 0% -99.99% (p=0.000 n=8+9) RefreshRange/linear-keys/refresh_window=[50.00,99.00]-10 99.0MB ± 0% 0.0MB ± 0% -99.99% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[75.00,75.00]-10 148MB ± 0% 0MB ±29% -99.99% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,75.00]-10 148MB ± 0% 0MB ± 6% -99.98% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[50.00,50.00]-10 98.7MB ± 0% 0.0MB ± 1% -99.98% (p=0.000 n=10+8) RefreshRange/random-keys/refresh_window=[99.00,99.00]-10 195MB ± 0% 0MB ± 4% -99.97% (p=0.000 n=9+8) RefreshRange/random-keys/refresh_window=[95.00,95.00]-10 188MB ± 0% 0MB ± 3% -99.97% (p=0.000 n=10+8) RefreshRange/random-keys/refresh_window=[50.00,50.00]-10 98.7MB ± 0% 0.1MB ±12% -99.92% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,0.00]-10 41.9kB ± 5% 1.1kB ± 0% -97.27% (p=0.000 n=9+9) RefreshRange/linear-keys/refresh_window=[0.00,50.00]-10 208kB ± 0% 8kB ± 0% -96.04% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[95.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[95.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[50.00,75.00]-10 208kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=8+10) RefreshRange/mixed-case/refresh_window=[0.00,75.00]-10 208kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=10+9) RefreshRange/mixed-case/refresh_window=[0.00,50.00]-10 207kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[0.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,50.00]-10 207kB ± 0% 8kB ± 0% -96.03% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,75.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) 
RefreshRange/random-keys/refresh_window=[0.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=9+10) RefreshRange/random-keys/refresh_window=[75.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,75.00]-10 207kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,75.00]-10 207kB ± 0% 8kB ± 0% -96.02% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[0.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[75.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,99.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[75.00,95.00]-10 208kB ± 0% 8kB ± 0% -96.01% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,0.00]-10 29.3kB ± 4% 1.2kB ± 0% -95.95% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[0.00,0.00]-10 9.07kB ±25% 1.13kB ± 0% -87.51% (p=0.000 n=10+10) name old allocs/op new allocs/op delta RefreshRange/linear-keys/refresh_window=[99.00,99.00]-10 18.8k ± 0% 0.0k ± 0% -99.91% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[95.00,95.00]-10 18.1k ± 0% 0.0k ± 0% -99.91% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[95.00,95.00]-10 17.5k ± 0% 0.0k ± 0% -99.90% (p=0.000 n=9+10) 
RefreshRange/linear-keys/refresh_window=[75.00,75.00]-10 14.4k ± 0% 0.0k ± 0% -99.88% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,50.00]-10 9.81k ± 0% 0.02k ± 0% -99.83% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[95.00,99.00]-10 18.2k ± 0% 0.1k ± 0% -99.69% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[75.00,95.00]-10 14.4k ± 0% 0.1k ± 0% -99.61% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[75.00,99.00]-10 14.4k ± 0% 0.1k ± 0% -99.61% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[99.00,99.00]-10 18.2k ± 0% 0.1k ± 2% -99.51% (p=0.000 n=10+8) RefreshRange/linear-keys/refresh_window=[50.00,95.00]-10 9.62k ± 0% 0.06k ± 0% -99.40% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,75.00]-10 9.62k ± 0% 0.06k ± 0% -99.40% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[50.00,99.00]-10 9.62k ± 0% 0.06k ± 0% -99.40% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[75.00,75.00]-10 13.8k ± 0% 0.2k ±28% -98.85% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[50.00,50.00]-10 9.25k ± 0% 0.17k ± 6% -98.11% (p=0.000 n=10+9) RefreshRange/random-keys/refresh_window=[75.00,75.00]-10 14.2k ± 0% 0.3k ± 8% -97.89% (p=0.000 n=10+9) RefreshRange/linear-keys/refresh_window=[0.00,0.00]-10 535 ± 3% 17 ± 0% -96.82% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[99.00,99.00]-10 18.6k ± 0% 0.8k ± 3% -95.56% (p=0.000 n=10+8) RefreshRange/random-keys/refresh_window=[95.00,95.00]-10 17.9k ± 0% 0.8k ± 2% -95.34% (p=0.000 n=10+8) RefreshRange/random-keys/refresh_window=[0.00,0.00]-10 351 ± 3% 18 ± 0% -94.88% (p=0.000 n=9+10) RefreshRange/random-keys/refresh_window=[50.00,50.00]-10 9.59k ± 0% 0.95k ±12% -90.05% (p=0.000 n=9+10) RefreshRange/mixed-case/refresh_window=[0.00,0.00]-10 73.7 ±10% 17.0 ± 0% -76.93% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,50.00]-10 80.0 ± 0% 56.0 ± 0% -30.00% (p=0.000 n=10+10) 
RefreshRange/random-keys/refresh_window=[0.00,50.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=9+10) RefreshRange/random-keys/refresh_window=[95.00,99.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,50.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,75.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,75.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[95.00,99.00]-10 79.0 ± 0% 56.0 ± 0% -29.11% (p=0.000 n=10+10) RefreshRange/linear-keys/refresh_window=[0.00,75.00]-10 80.0 ± 0% 58.0 ± 0% -27.50% (p=0.000 n=9+10) RefreshRange/linear-keys/refresh_window=[0.00,95.00]-10 80.0 ± 0% 58.0 ± 0% -27.50% (p=0.002 n=8+10) RefreshRange/linear-keys/refresh_window=[0.00,99.00]-10 79.7 ± 1% 58.0 ± 0% -27.23% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,75.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,95.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[0.00,99.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,75.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,95.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[50.00,99.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,95.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/random-keys/refresh_window=[75.00,99.00]-10 79.0 ± 0% 58.0 ± 0% -26.58% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,95.00]-10 79.0 ± 0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[0.00,99.00]-10 79.0 ± 0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,95.00]-10 79.0 ± 
0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[50.00,99.00]-10 79.0 ± 0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[75.00,95.00]-10 79.0 ± 0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) RefreshRange/mixed-case/refresh_window=[75.00,99.00]-10 79.0 ± 0% 60.0 ± 0% -24.05% (p=0.000 n=10+10) ``` ---- Release note (performance improvement): transaction read refresh operations performed during optimistic concurrency control's validation phase now use a time-bound file filter when scanning the LSM tree. This allows these operations to avoid scanning files that contain no keys written since the transaction originally performed its reads. --- pkg/kv/kvserver/batcheval/BUILD.bazel | 5 + .../kvserver/batcheval/cmd_refresh_range.go | 109 +++++-- .../batcheval/cmd_refresh_range_bench_test.go | 300 ++++++++++++++++++ .../batcheval/cmd_refresh_range_test.go | 10 + pkg/storage/mvcc_incremental_iterator.go | 10 +- 5 files changed, 393 insertions(+), 41 deletions(-) create mode 100644 pkg/kv/kvserver/batcheval/cmd_refresh_range_bench_test.go diff --git a/pkg/kv/kvserver/batcheval/BUILD.bazel b/pkg/kv/kvserver/batcheval/BUILD.bazel index 093634e70587..3e754b703483 100644 --- a/pkg/kv/kvserver/batcheval/BUILD.bazel +++ b/pkg/kv/kvserver/batcheval/BUILD.bazel @@ -104,6 +104,7 @@ go_test( "cmd_lease_test.go", "cmd_query_resolved_timestamp_test.go", "cmd_recover_txn_test.go", + "cmd_refresh_range_bench_test.go", "cmd_refresh_range_test.go", "cmd_refresh_test.go", "cmd_resolve_intent_test.go", @@ -140,6 +141,7 @@ go_test( "//pkg/testutils/skip", "//pkg/testutils/sqlutils", "//pkg/testutils/testcluster", + "//pkg/util/encoding", "//pkg/util/hlc", "//pkg/util/leaktest", "//pkg/util/log", @@ -149,6 +151,9 @@ go_test( "//pkg/util/uint128", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_errors//oserror", + "@com_github_cockroachdb_pebble//:pebble", + "@com_github_cockroachdb_pebble//vfs", 
"@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", ], diff --git a/pkg/kv/kvserver/batcheval/cmd_refresh_range.go b/pkg/kv/kvserver/batcheval/cmd_refresh_range.go index 0f026bd95adc..8a86c964dc28 100644 --- a/pkg/kv/kvserver/batcheval/cmd_refresh_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_refresh_range.go @@ -15,11 +15,25 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" ) +// refreshRangeTBIEnabled controls whether we use a TBI during ranged refreshes. +var refreshRangeTBIEnabled = settings.RegisterBoolSetting( + settings.SystemOnly, + "kv.refresh_range.time_bound_iterators.enabled", + "use time-bound iterators when performing ranged transaction refreshes", + util.ConstantWithMetamorphicTestBool("kv.refresh_range.time_bound_iterators_enabled", true), +) + func init() { RegisterReadOnlyCommand(roachpb.RefreshRange, DefaultDeclareKeys, RefreshRange) } @@ -50,40 +64,71 @@ func RefreshRange( return result.Result{}, errors.AssertionFailedf("empty RefreshFrom: %s", args) } - // Iterate over values until we discover any value written after the - // original timestamp, but before or at the current timestamp. Note that we - // iterate inconsistently, meaning that intents - including our own - are - // collected separately and the callback is only invoked on the latest - // committed version. Note also that we include tombstones, which must be - // considered as updates on refresh. 
log.VEventf(ctx, 2, "refresh %s @[%s-%s]", args.Span(), refreshFrom, refreshTo) - intents, err := storage.MVCCIterate( - ctx, reader, args.Key, args.EndKey, refreshTo, - storage.MVCCScanOptions{ - Inconsistent: true, - Tombstones: true, - }, - func(kv roachpb.KeyValue) error { - if ts := kv.Value.Timestamp; refreshFrom.Less(ts) { - return roachpb.NewRefreshFailedError(roachpb.RefreshFailedError_REASON_COMMITTED_VALUE, kv.Key, ts) - } - return nil - }) - if err != nil { - return result.Result{}, err - } + tbi := refreshRangeTBIEnabled.Get(&cArgs.EvalCtx.ClusterSettings().SV) + return result.Result{}, refreshRange(reader, tbi, args.Span(), refreshFrom, refreshTo, h.Txn.ID) +} + +// refreshRange iterates over the specified key span until it discovers a value +// written after the refreshFrom timestamp but before or at the refreshTo +// timestamp. The iteration observes MVCC tombstones, which must be considered +// as conflicts during a refresh. The iteration also observes intents, and any +// intent that is not owned by the specified txn ID is considered a conflict. +// +// If such a conflict is found, the function returns an error. Otherwise, no +// error is returned. +func refreshRange( + reader storage.Reader, + timeBoundIterator bool, + span roachpb.Span, + refreshFrom, refreshTo hlc.Timestamp, + txnID uuid.UUID, +) error { + // Construct an incremental iterator with the desired time bounds. Incremental + // iterators will emit MVCC tombstones by default and will emit intents when + // configured to do so (see IntentPolicy). 
+ iter := storage.NewMVCCIncrementalIterator(reader, storage.MVCCIncrementalIterOptions{ + EnableTimeBoundIteratorOptimization: timeBoundIterator, + EndKey: span.EndKey, + StartTime: refreshFrom, // exclusive + EndTime: refreshTo, // inclusive + IntentPolicy: storage.MVCCIncrementalIterIntentPolicyEmit, + }) + defer iter.Close() - // Check if any intents which are not owned by this transaction were written - // at or beneath the refresh timestamp. - for _, i := range intents { - // Ignore our own intents. - if i.Txn.ID == h.Txn.ID { - continue + var meta enginepb.MVCCMetadata + iter.SeekGE(storage.MakeMVCCMetadataKey(span.Key)) + for { + if ok, err := iter.Valid(); err != nil { + return err + } else if !ok { + break + } + + key := iter.Key() + if !key.IsValue() { + // Found an intent. Check whether it is owned by this transaction. + // If so, proceed with iteration. Otherwise, return an error. + if err := protoutil.Unmarshal(iter.UnsafeValue(), &meta); err != nil { + return errors.Wrapf(err, "unmarshaling mvcc meta: %v", key) + } + if meta.Txn.ID == txnID { + // Ignore the transaction's own intent and skip past the corresponding + // provisional key-value. To do this, scan to the timestamp immediately + // before (i.e. the key immediately after) the provisional key. + iter.SeekGE(storage.MVCCKey{ + Key: key.Key, + Timestamp: meta.Timestamp.ToTimestamp().Prev(), + }) + continue + } + return roachpb.NewRefreshFailedError(roachpb.RefreshFailedError_REASON_INTENT, + key.Key, meta.Txn.WriteTimestamp) } - // Return an error if an intent was written to the span. - return result.Result{}, roachpb.NewRefreshFailedError(roachpb.RefreshFailedError_REASON_INTENT, - i.Key, i.Txn.WriteTimestamp) - } - return result.Result{}, nil + // If a committed value is found, return an error. 
+ return roachpb.NewRefreshFailedError(roachpb.RefreshFailedError_REASON_COMMITTED_VALUE, + key.Key, key.Timestamp) + } + return nil } diff --git a/pkg/kv/kvserver/batcheval/cmd_refresh_range_bench_test.go b/pkg/kv/kvserver/batcheval/cmd_refresh_range_bench_test.go new file mode 100644 index 000000000000..43c0ca0f3984 --- /dev/null +++ b/pkg/kv/kvserver/batcheval/cmd_refresh_range_bench_test.go @@ -0,0 +1,300 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package batcheval_test + +import ( + "context" + "fmt" + "math/rand" + "os" + "path/filepath" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/util/encoding" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +// BenchmarkRefreshRange benchmarks ranged refresh requests with different LSM +// shapes and refresh windows. It was heavily adapted from BenchmarkCatchUpScan, +// which was itself heavily adapted from code in pkg/storage. 
+func BenchmarkRefreshRange(b *testing.B) { + defer log.Scope(b).Close(b) + + numKeys := 1_000_000 + valueBytes := 64 + + dataOpts := map[string]benchDataOptions{ + // linear-keys is one of our best-case scenarios. In + // this case, each newly written row is at a key + // following the previously written row and at a later + // timestamp. Further, once compacted, all of the SSTs + // should be in L5 and L6. As a result, the time-based + // optimization can exclude SSTs fairly easily. + "linear-keys": { + numKeys: numKeys, + valueBytes: valueBytes, + }, + // random-keys is our worst case. We write keys in + // random order but with timestamps that keep marching + // forward. Once compacted, most of the data is in L5 + // and L6. So, we have very few overlapping SSTs and + // most SSTs in our lower level will have at least 1 + // key that needs to be included in our scan, despite + // the time based optimization. + "random-keys": { + randomKeyOrder: true, + numKeys: numKeys, + valueBytes: valueBytes, + }, + // mixed-case is a middling case. + // + // This case is trying to simulate a larger store, but + // with fewer bytes. If we did not reduce + // LBaseMaxBytes, almost all data would be in Lbase or + // L6, and TBI would be ineffective. By reducing + // LBaseMaxBytes, the data should spread out over more + // levels, like in a real store. The LSM state + // depicted below shows that this was only partially + // successful. + // + // We return a read only engine to prevent read-based + // compactions after the initial data generation. 
+ "mixed-case": { + randomKeyOrder: true, + numKeys: numKeys, + valueBytes: valueBytes, + readOnlyEngine: true, + lBaseMaxBytes: 256, + }, + } + + for name, do := range dataOpts { + b.Run(name, func(b *testing.B) { + tsPercents := []float64{0.0, 0.50, 0.75, 0.95, 0.99} + for _, refreshFrom := range tsPercents { + for _, refreshTo := range tsPercents { + if refreshTo < refreshFrom { + continue + } + name := fmt.Sprintf("refresh_window=[%2.2f,%2.2f]", refreshFrom*100, refreshTo*100) + b.Run(name, func(b *testing.B) { + tsForPercent := func(p float64) hlc.Timestamp { + walltime := int64(5 * (float64(numKeys)*p + 1)) // see setupData + return hlc.Timestamp{WallTime: walltime} + } + runRefreshRangeBenchmark(b, setupMVCCPebble, benchOptions{ + refreshFrom: tsForPercent(refreshFrom), // exclusive + refreshTo: tsForPercent(refreshTo).Next(), // inclusive + dataOpts: do, + }) + }) + } + } + }) + } +} + +func runRefreshRangeBenchmark(b *testing.B, emk engineMaker, opts benchOptions) { + ctx := context.Background() + eng, _ := setupData(ctx, b, emk, opts.dataOpts) + defer eng.Close() + st := cluster.MakeTestingClusterSettings() + evalCtx := (&batcheval.MockEvalCtx{ClusterSettings: st}).EvalContext() + startKey := roachpb.Key(encoding.EncodeUvarintAscending([]byte("key-"), uint64(0))) + endKey := roachpb.Key(encoding.EncodeUvarintAscending([]byte("key-"), uint64(opts.dataOpts.numKeys))) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + func() { + var resp roachpb.RefreshRangeResponse + _, err := batcheval.RefreshRange(ctx, eng, batcheval.CommandArgs{ + EvalCtx: evalCtx, + Args: &roachpb.RefreshRangeRequest{ + RequestHeader: roachpb.RequestHeader{ + Key: startKey, + EndKey: endKey, + }, + RefreshFrom: opts.refreshFrom, + }, + Header: roachpb.Header{ + Txn: &roachpb.Transaction{ + TxnMeta: enginepb.TxnMeta{ + WriteTimestamp: opts.refreshTo, + }, + ReadTimestamp: opts.refreshTo, + }, + Timestamp: opts.refreshTo, + }, + }, &resp) + + // If the refresh window was empty, we expect 
the refresh to scan the + // entire span and succeed. Otherwise, it will short-circuit as soon + // as it hits a conflict and return an error. + emptyWindow := opts.refreshTo.Equal(opts.refreshFrom.Next()) + if emptyWindow { + require.NoError(b, err) + } else { + require.Error(b, err) + require.Regexp(b, "encountered recently written committed value", err) + } + }() + } +} + +type benchDataOptions struct { + numKeys int + valueBytes int + randomKeyOrder bool + readOnlyEngine bool + lBaseMaxBytes int64 +} + +type benchOptions struct { + refreshFrom hlc.Timestamp + refreshTo hlc.Timestamp + dataOpts benchDataOptions +} + +type engineMaker func(testing.TB, string, int64, bool) storage.Engine + +func setupMVCCPebble(b testing.TB, dir string, lBaseMaxBytes int64, readOnly bool) storage.Engine { + opts := storage.DefaultPebbleOptions() + opts.FS = vfs.Default + opts.LBaseMaxBytes = lBaseMaxBytes + opts.ReadOnly = readOnly + opts.FormatMajorVersion = pebble.FormatBlockPropertyCollector + peb, err := storage.NewPebble( + context.Background(), + storage.PebbleConfig{ + StorageConfig: base.StorageConfig{Dir: dir, Settings: cluster.MakeTestingClusterSettings()}, + Opts: opts, + }) + if err != nil { + b.Fatalf("could not create new pebble instance at %s: %+v", dir, err) + } + return peb +} + +// setupData writes numKeys keys. One version of each key +// is written. The write timestamp starts at 5ns and then advances in 5ns +// increments. This allows scans at various times, starting at t=5ns, +// and continuing to t=5ns*(numKeys+1). The goal of this is to +// approximate an append-only type workload. +// +// A read-only engine can be returned if opts.readOnlyEngine is +// set. The goal of this is to prevent read-triggered compactions that +// might change the distribution of data across levels. +// +// The creation of the database is time consuming, especially for +// larger numbers of versions. The database is persisted between runs +// and stored in the current directory. 
+func setupData( + ctx context.Context, b *testing.B, emk engineMaker, opts benchDataOptions, +) (storage.Engine, string) { + orderStr := "linear" + if opts.randomKeyOrder { + orderStr = "random" + } + readOnlyStr := "" + if opts.readOnlyEngine { + readOnlyStr = "_readonly" + } + loc := fmt.Sprintf("refresh_range_bench_data_%s%s_%d_%d_%d", + orderStr, readOnlyStr, opts.numKeys, opts.valueBytes, opts.lBaseMaxBytes) + exists := true + if _, err := os.Stat(loc); oserror.IsNotExist(err) { + exists = false + } else if err != nil { + b.Fatal(err) + } + + if exists { + testutils.ReadAllFiles(filepath.Join(loc, "*")) + return emk(b, loc, opts.lBaseMaxBytes, opts.readOnlyEngine), loc + } + + eng := emk(b, loc, opts.lBaseMaxBytes, false) + log.Infof(ctx, "creating refresh range benchmark data: %s", loc) + + // Generate the same data every time. + rng := rand.New(rand.NewSource(1449168817)) + + keys := make([]roachpb.Key, opts.numKeys) + order := make([]int, 0, opts.numKeys) + for i := 0; i < opts.numKeys; i++ { + keys[i] = encoding.EncodeUvarintAscending([]byte("key-"), uint64(i)) + order = append(order, i) + } + + if opts.randomKeyOrder { + rng.Shuffle(len(order), func(i, j int) { + order[i], order[j] = order[j], order[i] + }) + } + + writeKey := func(batch storage.Batch, idx int, pos int) { + key := keys[idx] + value := roachpb.MakeValueFromBytes(randutil.RandBytes(rng, opts.valueBytes)) + value.InitChecksum(key) + ts := hlc.Timestamp{WallTime: int64((pos + 1) * 5)} + if err := storage.MVCCPut(ctx, batch, nil /* ms */, key, ts, value, nil); err != nil { + b.Fatal(err) + } + } + + batch := eng.NewBatch() + for i, idx := range order { + // Output the keys in ~20 batches. If we used a single batch to output all + // of the keys rocksdb would create a single sstable. We want multiple + // sstables in order to exercise filtering of which sstables are examined + // during iterator seeking. 
We fix the number of batches we output so that + // optimizations which change the data size result in the same number of + // sstables. + if scaled := len(order) / 20; i > 0 && (i%scaled) == 0 { + log.Infof(ctx, "committing (%d/~%d) (%d/%d)", i/scaled, 20, i, len(order)) + if err := batch.Commit(false /* sync */); err != nil { + b.Fatal(err) + } + batch.Close() + batch = eng.NewBatch() + if err := eng.Flush(); err != nil { + b.Fatal(err) + } + } + writeKey(batch, idx, i) + } + if err := batch.Commit(false /* sync */); err != nil { + b.Fatal(err) + } + batch.Close() + if err := eng.Flush(); err != nil { + b.Fatal(err) + } + + if opts.readOnlyEngine { + eng.Close() + eng = emk(b, loc, opts.lBaseMaxBytes, opts.readOnlyEngine) + } + return eng, loc +} diff --git a/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go b/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go index f8bdfd27b536..535a45194923 100644 --- a/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_refresh_range_test.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -121,6 +122,9 @@ func TestRefreshRangeTimeBoundIterator(t *testing.T) { // resulting in an error from RefreshRange. var resp roachpb.RefreshRangeResponse _, err := RefreshRange(ctx, db, CommandArgs{ + EvalCtx: (&MockEvalCtx{ + ClusterSettings: cluster.MakeTestingClusterSettings(), + }).EvalContext(), Args: &roachpb.RefreshRangeRequest{ RequestHeader: roachpb.RequestHeader{ Key: k, @@ -193,6 +197,9 @@ func TestRefreshRangeError(t *testing.T) { // 2, therefore the refresh should fail. 
var resp roachpb.RefreshRangeResponse _, err := RefreshRange(ctx, db, CommandArgs{ + EvalCtx: (&MockEvalCtx{ + ClusterSettings: cluster.MakeTestingClusterSettings(), + }).EvalContext(), Args: &roachpb.RefreshRangeRequest{ RequestHeader: roachpb.RequestHeader{ Key: k, @@ -254,6 +261,9 @@ func TestRefreshRangeTimestampBounds(t *testing.T) { } { var resp roachpb.RefreshRangeResponse _, err := RefreshRange(ctx, db, CommandArgs{ + EvalCtx: (&MockEvalCtx{ + ClusterSettings: cluster.MakeTestingClusterSettings(), + }).EvalContext(), Args: &roachpb.RefreshRangeRequest{ RequestHeader: roachpb.RequestHeader{ Key: k, diff --git a/pkg/storage/mvcc_incremental_iterator.go b/pkg/storage/mvcc_incremental_iterator.go index 8dd6091d6033..0beb1b464ed5 100644 --- a/pkg/storage/mvcc_incremental_iterator.go +++ b/pkg/storage/mvcc_incremental_iterator.go @@ -52,9 +52,6 @@ import ( // if !ok { ... } // [code using iter.Key() and iter.Value()] // } -// if err := iter.Error(); err != nil { -// ... -// } // // Note regarding the correctness of the time-bound iterator optimization: // @@ -145,9 +142,7 @@ type MVCCIncrementalIterOptions struct { EnableTimeBoundIteratorOptimization bool EndKey roachpb.Key // Keys visible by the MVCCIncrementalIterator must be within (StartTime, - // EndTime]. Note that if {Min,Max}TimestampHints are specified in - // IterOptions, the timestamp hints interval should include the start and end - // time. + // EndTime]. StartTime hlc.Timestamp EndTime hlc.Timestamp @@ -158,9 +153,6 @@ type MVCCIncrementalIterOptions struct { // NewMVCCIncrementalIterator creates an MVCCIncrementalIterator with the // specified reader and options. The timestamp hint range should not be more // restrictive than the start and end time range. -// TODO(pbardea): Add validation here and in C++ implementation that the -// timestamp hints are not more restrictive than incremental iterator's -// (startTime, endTime] interval. 
func NewMVCCIncrementalIterator( reader Reader, opts MVCCIncrementalIterOptions, ) *MVCCIncrementalIterator {