From 5ae7a8d150e1904c233af5ff9ddf2911facb8ad5 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Thu, 1 Jun 2023 15:27:29 -0700 Subject: [PATCH] `CompactRange()` always compacts to bottommost level for leveled compaction (#11468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Currently, for leveled compaction, the max output level of a call to `CompactRange()` is pre-computed before compacting each level. This max output level is the max level whose key range overlaps with the manual compaction key range. However, during manual compaction, files in the max output level may be compacted down further by some background compaction. When this background compaction is a trivial move, there is a race condition and the manual compaction may not be able to compact all keys in the specified key range. This PR updates `CompactRange()` to always compact to the bottommost level to make this race condition less likely (it can still happen; see the comment here: https://github.com/cbi42/rocksdb/blob/796f58f42ad1bdbf49e5fcf480763f11583b790e/db/db_impl/db_impl_compaction_flush.cc#L1180C29-L1184). This PR also changes the behavior of CompactRange() when `bottommost_level_compaction=kIfHaveCompactionFilter` (the default option). The old behavior is that, if a compaction filter is provided, CompactRange() always does an intra-level compaction at the final output level for all files in the manual compaction key range. The only exception is when `first_overlapped_level = 0` and `max_overlapped_level = 0`. It’s awkward to maintain the same behavior after this PR since we do not compute max_overlapped_level anymore. So the new behavior is similar to kForceOptimized: always do an intra-level compaction at the bottommost level, but exclude new files generated during this manual compaction. Several unit tests are updated to work with this new manual compaction behavior. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11468 Test Plan: Add new unit tests `DBCompactionTest.ManualCompactionCompactAllKeysInRange*` Reviewed By: ajkr Differential Revision: D46079619 Pulled By: cbi42 fbshipit-source-id: 19d844ba4ec8dc1a0b8af5d2f36ff15820c6e76f Signed-off-by: tabokie --- HISTORY.md | 2 + db/compaction/compaction_picker.cc | 6 ++- db/db_bloom_filter_test.cc | 5 +- db/db_compaction_test.cc | 24 +++------ db/db_impl/db_impl_compaction_flush.cc | 49 ++++++++++++------- db/db_test2.cc | 11 +++-- db/manual_compaction_test.cc | 6 +-- include/rocksdb/options.h | 13 +++-- .../compact_range_to_bottom.md | 2 + 9 files changed, 63 insertions(+), 55 deletions(-) create mode 100644 unreleased_history/behavior_changes/compact_range_to_bottom.md diff --git a/HISTORY.md b/HISTORY.md index e0cc8b78951..a93b137baf5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -23,6 +23,8 @@ * For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330). * DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516). * If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do trivial move from L0 to L1 and then do an intra L1 compaction, instead of a L0 to L1 compaction with trivial move disabled (#11375). 
+* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468) +For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468) ## 6.29.5 (03/29/2022) ### Bug Fixes diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 42d7f8b2cc8..fe9bd096e95 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -717,8 +717,10 @@ Compaction* CompactionPicker::CompactRange( // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out // files that are created during the current compaction. - if (compact_range_options.bottommost_level_compaction == - BottommostLevelCompaction::kForceOptimized && + if ((compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized || + compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter) && max_file_num_to_ignore != port::kMaxUint64) { assert(input_level == output_level); // inputs_shrunk holds a continuous subset of input files which were all diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 5b444d841af..c3260785db4 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -2225,10 +2225,9 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) - .IsNotSupported()); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); - ASSERT_EQ(trivial_move, 1); + ASSERT_GE(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 387addaa6f4..fb90eba90be 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -5974,26 +5974,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { auto start_idx = key_idx; GenerateNewFile(&rnd, &key_idx); GenerateNewFile(&rnd, &key_idx); - auto end_idx = key_idx - 1; ASSERT_EQ("1,1,2", FilesPerLevel(0)); - // Next two CompactRange() calls are used to test exercise error paths within - // RefitLevel() before triggering a valid RefitLevel() call - - // Trigger a refit to L1 first - { - std::string begin_string = Key(start_idx); - std::string end_string = Key(end_idx); - Slice begin(begin_string); - Slice end(end_string); - - CompactRangeOptions cro; - cro.change_level = true; - cro.target_level = 1; - ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); - } - ASSERT_EQ("0,3,2", FilesPerLevel(0)); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,2", FilesPerLevel(0)); + // The next CompactRange() call is used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + // // Try a refit from L2->L1 - this should fail and exercise error paths in // RefitLevel() { @@ -6008,7 +5996,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { cro.target_level = 1; ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); } - ASSERT_EQ("0,3,2", FilesPerLevel(0)); + ASSERT_EQ("0,2,2", FilesPerLevel(0)); // Try a valid Refit request to ensure, the path is still working { 
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index b8fedbfeb2a..c371d260cd4 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1085,7 +1085,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, false /* disable_trivial_move */, port::kMaxUint64); } else { int first_overlapped_level = kInvalidLevel; - int max_overlapped_level = kInvalidLevel; { SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); Version* current_version = super_version->current; @@ -1108,10 +1107,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, begin, end); } if (overlap) { - if (first_overlapped_level == kInvalidLevel) { - first_overlapped_level = level; - } - max_overlapped_level = level; + first_overlapped_level = level; + break; } } CleanupSuperVersion(super_version); @@ -1124,7 +1121,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, cfd, first_overlapped_level, first_overlapped_level, options, begin, end, exclusive, true /* disallow_trivial_move */, std::numeric_limits::max() /* max_file_num_to_ignore */); - final_output_level = max_overlapped_level; + final_output_level = first_overlapped_level; } else { assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel); uint64_t next_file_number = versions_->current_next_file_number(); @@ -1135,8 +1132,30 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, // at L1 (or LBase), if applicable. int level = first_overlapped_level; final_output_level = level; - int output_level, base_level; - while (level < max_overlapped_level || level == 0) { + int output_level = 0, base_level = 0; + for (;;) { + // Always allow L0 -> L1 compaction + if (level > 0) { + if (cfd->ioptions()->level_compaction_dynamic_level_bytes) { + assert(final_output_level < cfd->ioptions()->num_levels); + if (final_output_level + 1 == cfd->ioptions()->num_levels) { + break; + } + } else { + // TODO(cbi): there is still a race condition here where + // if a background compaction compacts some file beyond + // current()->storage_info()->num_non_empty_levels() right after + // the check here. This should happen very infrequently and should + // not happen once a user populates the last level of the LSM. + InstrumentedMutexLock l(&mutex_); + // num_non_empty_levels may be lower after a compaction, so + // we check for >= here. + if (final_output_level + 1 >= + cfd->current()->storage_info()->num_non_empty_levels()) { + break; + } + } + } output_level = level + 1; if (cfd->ioptions()->level_compaction_dynamic_level_bytes && level == 0) { @@ -1167,17 +1186,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, if (s.ok()) { assert(final_output_level > 0); // bottommost level intra-level compaction - // TODO(cbi): this preserves earlier behavior where if - // max_overlapped_level = 0 and bottommost_level_compaction is - // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction - // and do not do intra-LBase compaction even when user configures - // compaction filter. We may want to still do a LBase -> LBase - // compaction in case there is some file in LBase that did not go - // through L0 -> LBase compaction, and hence did not go through - // compaction filter. 
if ((options.bottommost_level_compaction == BottommostLevelCompaction::kIfHaveCompactionFilter && - max_overlapped_level != 0 && (cfd->ioptions()->compaction_filter != nullptr || cfd->ioptions()->compaction_filter_factory != nullptr)) || options.bottommost_level_compaction == @@ -1185,10 +1195,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, options.bottommost_level_compaction == BottommostLevelCompaction::kForce) { // Use `next_file_number` as `max_file_num_to_ignore` to avoid - // rewriting newly compacted files when it is kForceOptimized. + // rewriting newly compacted files when it is kForceOptimized + // or kIfHaveCompactionFilter with compaction filter set. s = RunManualCompaction( cfd, final_output_level, final_output_level, options, begin, - end, exclusive, false /* disallow_trivial_move */, + end, exclusive, true /* disallow_trivial_move */, next_file_number /* max_file_num_to_ignore */); } } diff --git a/db/db_test2.cc b/db/db_test2.cc index 2f91be2f9cd..9dc0b4282c2 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2206,6 +2206,7 @@ class PinL0IndexAndFilterBlocksTest ASSERT_OK(Flush(1)); // move this table to L1 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_EQ(1, NumTableFilesAtLevel(1, 1)); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); @@ -2363,7 +2364,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { // this should be read from L1 value = Get(1, "a"); if (!disallow_preload_) { - // In inifinite max files case, there's a cache miss in executing Get() + // In infinite max files case, there's a cache miss in executing Get() // because index and filter are not prefetched before. ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); @@ -2391,12 +2392,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. 
@@ -2405,12 +2406,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 2eb4812c71e..60aa389022e 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -288,9 +288,9 @@ TEST_F(ManualCompactionTest, SkipLevel) { filter->Reset(); ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(4, filter->NumKeys()); - // 1 is first compacted to L1 and then further compacted into [2, 4, 8], - // so finally the logged level for 1 is L1. - ASSERT_EQ(1, filter->KeyLevel("1")); + // 1 is first compacted from L0 to L1, and then L1 intra level compaction + // compacts [2, 4, 8] only. + ASSERT_EQ(0, filter->KeyLevel("1")); ASSERT_EQ(1, filter->KeyLevel("2")); ASSERT_EQ(1, filter->KeyLevel("4")); ASSERT_EQ(1, filter->KeyLevel("8")); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ee8d89a3b3a..c489412d8bd 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1763,15 +1763,18 @@ struct CompactionOptions { // For level based compaction, we can configure if we want to skip/force // bottommost level compaction. enum class BottommostLevelCompaction { - // Skip bottommost level compaction + // Skip bottommost level compaction. kSkip, - // Only compact bottommost level if there is a compaction filter - // This is the default option + // Only compact bottommost level if there is a compaction filter. + // This is the default option. + // Similar to kForceOptimized, when compacting bottommost level, avoid + // double-compacting files + // created in the same manual compaction. kIfHaveCompactionFilter, - // Always compact bottommost level + // Always compact bottommost level. kForce, // Always compact bottommost level but in bottommost level avoid - // double-compacting files created in the same compaction + // double-compacting files created in the same compaction. kForceOptimized, }; diff --git a/unreleased_history/behavior_changes/compact_range_to_bottom.md b/unreleased_history/behavior_changes/compact_range_to_bottom.md new file mode 100644 index 00000000000..480caed1ed5 --- /dev/null +++ b/unreleased_history/behavior_changes/compact_range_to_bottom.md @@ -0,0 +1,2 @@ +For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468) +For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468) \ No newline at end of file
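
From the caller's side, the behavior described above is exercised through the public `CompactRange()` API: with the default `bottommost_level_compaction = kIfHaveCompactionFilter` and a compaction filter configured, the manual compaction now cascades level by level down to the last non-empty level and finishes with an intra-level compaction there that skips the files this same manual compaction just produced. The snippet below is a minimal sketch of that usage, not part of the patch; the DB path and the no-op `NoopFilter` class are made up for illustration.

```cpp
#include <cassert>
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Hypothetical no-op compaction filter, present only so that
// kIfHaveCompactionFilter schedules the bottommost-level intra-level
// compaction described in this patch.
class NoopFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& /*value*/, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return false;  // keep every key unchanged
  }
  const char* Name() const override { return "NoopFilter"; }
};

int main() {
  NoopFilter filter;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleLevel;
  options.compaction_filter = &filter;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/compact_range_demo", &db);
  assert(s.ok());

  // Default option: kIfHaveCompactionFilter. With this patch, the manual
  // compaction compacts down to the last non-empty level, then runs an
  // intra-level compaction there that ignores files generated by this same
  // CompactRange() call (kForceOptimized-like behavior).
  rocksdb::CompactRangeOptions cro;
  cro.bottommost_level_compaction =
      rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
  s = db->CompactRange(cro, nullptr /*begin*/, nullptr /*end*/);
  assert(s.ok());

  delete db;
  return 0;
}
```

Per the enum comments in `include/rocksdb/options.h`, `kSkip` omits that final bottommost step entirely, while `kForce`/`kForceOptimized` run it even without a compaction filter.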