From 2fc9ec11533784ce074c64a4270d877fb44b1705 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Thu, 20 Apr 2023 11:10:48 -0700
Subject: [PATCH 1/3] Always allow L0->L1 trivial move during manual compaction (#11375)

Summary:
During manual compaction (CompactRange()), L0->L1 trivial move is disabled when only L0 overlaps with the compacting key range (introduced in https://github.com/facebook/rocksdb/issues/7368 to enforce the kForce* contract). This can cause large memory usage due to compaction readahead when the number of L0 files is large. This PR allows L0->L1 trivial move in this case, and will do an L1 -> L1 intra-level compaction when needed (`bottommost_level_compaction` is kForce*). In brief, consider a DB with files only in L0, where a user calls CompactRange(kForce, nullptr, nullptr):
- before this PR, RocksDB does an L0 -> L1 compaction (disallow trivial move),
- after this PR, RocksDB does an L0 -> L1 compaction (allow trivial move), and an L1 -> L1 compaction.
Users can use kForceOptimized to avoid this extra L1->L1 compaction overhead when L0s are overlapping and cannot be trivially moved.

This PR also fixes a bug (see previous discussion in https://github.com/facebook/rocksdb/issues/11041) where `final_output_level` of a manual compaction can be miscalculated when `level_compaction_dynamic_level_bytes=true`. This bug could cause data to be moved to an incorrect level when `CompactRangeOptions::change_level` is specified.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11375

Test Plan:
- Added new unit tests to test that L0 -> L1 compaction allows trivial move and L1 -> L1 compaction is done when needed.

Reviewed By: ajkr

Differential Revision: D44943518

Pulled By: cbi42

fbshipit-source-id: e9fb770d17b163c18a623e1d1bd6b81159192708
Signed-off-by: tabokie
---
 HISTORY.md                             |   1 +
 db/db_compaction_filter_test.cc        |   2 +-
 db/db_compaction_test.cc               |  43 ++++++--
 db/db_impl/db_impl.h                   |   6 +-
 db/db_impl/db_impl_compaction_flush.cc | 141 ++++++++++++++-----------
 db/db_sst_test.cc                      |   5 +-
 6 files changed, 124 insertions(+), 74 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 9750cc91036..e0cc8b78951 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -22,6 +22,7 @@
 ## Behavior Changes
 * For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
 * DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
+* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB will now try to do a trivial move from L0 to L1 and then do an intra-L1 compaction, instead of an L0 to L1 compaction with trivial move disabled (#11375).

 ## 6.29.5 (03/29/2022)
 ### Bug Fixes
diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc
index abf8b6c8f30..0c4aa16f8cb 100644
--- a/db/db_compaction_filter_test.cc
+++ b/db/db_compaction_filter_test.cc
@@ -753,7 +753,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
 }

 #ifndef ROCKSDB_LITE
-// Compaction filters aplies to all records, regardless snapshots.
+// Compaction filters apply to all records, regardless of snapshots.
TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
  std::string five = ToString(5);
  Options options = CurrentOptions();
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 6f5c6b486f8..387addaa6f4 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -56,11 +56,12 @@ class DBCompactionTestWithParam

 class DBCompactionTestWithBottommostParam
     : public DBTestBase,
-      public testing::WithParamInterface<BottommostLevelCompaction> {
+      public testing::WithParamInterface<
+          std::tuple<BottommostLevelCompaction, bool>> {
  public:
  DBCompactionTestWithBottommostParam()
      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
-    bottommost_level_compaction_ = GetParam();
+    bottommost_level_compaction_ = std::get<0>(GetParam());
  }

  BottommostLevelCompaction bottommost_level_compaction_;
@@ -5625,6 +5626,9 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
  constexpr int kSstNum = 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  const bool dynamic_level = std::get<1>(GetParam());
+  options.level_compaction_dynamic_level_bytes = dynamic_level;
  DestroyAndReopen(options);

  // Generate some sst files on level 0 with sequence keys (no overlap)
@@ -5642,25 +5646,42 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {

  auto cro = CompactRangeOptions();
  cro.bottommost_level_compaction = bottommost_level_compaction_;
+  bool trivial_moved = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_moved = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // All bottommost_level_compaction options should allow L0 -> L1 trivial move.
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(trivial_moved);
  if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
      bottommost_level_compaction_ ==
          BottommostLevelCompaction::kForceOptimized) {
-    // Real compaction to compact all sst files from level 0 to 1 file on level
-    // 1
-    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // The bottommost level should go through an intra-level compaction
+    // and have only 1 file.
+    if (dynamic_level) {
+      ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0));
+    } else {
+      ASSERT_EQ("0,1", FilesPerLevel(0));
+    }
  } else {
-    // Just trivial move from level 0 -> 1
-    ASSERT_EQ("0," + ToString(kSstNum), FilesPerLevel(0));
+    // Just trivial move from level 0 -> 1/base
+    if (dynamic_level) {
+      ASSERT_EQ("0,0,0,0,0,0," + std::to_string(kSstNum), FilesPerLevel(0));
+    } else {
+      ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+    }
  }
}

INSTANTIATE_TEST_CASE_P(
    DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
-    ::testing::Values(BottommostLevelCompaction::kSkip,
-                      BottommostLevelCompaction::kIfHaveCompactionFilter,
-                      BottommostLevelCompaction::kForce,
-                      BottommostLevelCompaction::kForceOptimized));
+    ::testing::Combine(
+        ::testing::Values(BottommostLevelCompaction::kSkip,
+                          BottommostLevelCompaction::kIfHaveCompactionFilter,
+                          BottommostLevelCompaction::kForce,
+                          BottommostLevelCompaction::kForceOptimized),
+        ::testing::Bool()));

TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
  Options options = CurrentOptions();
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index f7d7e4b172a..509d0308665 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -670,12 +670,16 @@ class DBImpl : public DB {
  // max_file_num_to_ignore allows bottom level compaction to filter out newly
  // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
  // disable the filtering.
+  // If `final_output_level` is not nullptr, it is set to the manual
+  // compaction's output level if the returned status is OK; if the status is
+  // not OK, it may or may not be set.
  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
                             int output_level,
                             const CompactRangeOptions& compact_range_options,
                             const Slice* begin, const Slice* end,
                             bool exclusive, bool disallow_trivial_move,
-                             uint64_t max_file_num_to_ignore);
+                             uint64_t max_file_num_to_ignore,
+                             int* final_output_level = nullptr);

  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9c476434657..b8fedbfeb2a 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1082,7 +1082,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
    }
    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
                            final_output_level, options, begin, end, exclusive,
-                            false, port::kMaxUint64);
+                            false /* disallow_trivial_move */, port::kMaxUint64);
  } else {
    int first_overlapped_level = kInvalidLevel;
    int max_overlapped_level = kInvalidLevel;
@@ -1117,70 +1117,81 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
      CleanupSuperVersion(super_version);
    }
    if (s.ok() && first_overlapped_level != kInvalidLevel) {
-      // max_file_num_to_ignore can be used to filter out newly created SST
-      // files, useful for bottom level compaction in a manual compaction
-      uint64_t max_file_num_to_ignore = port::kMaxUint64;
-      uint64_t next_file_number = versions_->current_next_file_number();
-      final_output_level = max_overlapped_level;
-      int output_level;
-      for (int level = first_overlapped_level; level <= max_overlapped_level;
-           level++) {
-        bool disallow_trivial_move = false;
-        // in case the compaction is universal or if we're compacting the
-        // bottom-most level, the output level will be the same as input one.
-        // level 0 can never be the bottommost level (i.e. if all files are in
-        // level 0, we will compact to level 1)
-        if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
-            cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-          output_level = level;
-        } else if (level == max_overlapped_level && level > 0) {
-          if (options.bottommost_level_compaction ==
-              BottommostLevelCompaction::kSkip) {
-            // Skip bottommost level compaction
-            continue;
-          } else if (options.bottommost_level_compaction ==
-                         BottommostLevelCompaction::kIfHaveCompactionFilter &&
-                     cfd->ioptions()->compaction_filter == nullptr &&
-                     cfd->ioptions()->compaction_filter_factory == nullptr) {
-            // Skip bottommost level compaction since we don't have a compaction
-            // filter
-            continue;
-          }
-          output_level = level;
-          // update max_file_num_to_ignore only for bottom level compaction
-          // because data in newly compacted files in middle levels may still
-          // need to be pushed down
-          max_file_num_to_ignore = next_file_number;
-        } else {
+      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        assert(first_overlapped_level == 0);
+        s = RunManualCompaction(
+            cfd, first_overlapped_level, first_overlapped_level, options,
+            begin, end, exclusive, true /* disallow_trivial_move */,
+            std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */);
+        final_output_level = max_overlapped_level;
+      } else {
+        assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
+        uint64_t next_file_number = versions_->current_next_file_number();
+        // Start compaction from `first_overlapped_level`, one level down at a
+        // time, until output level >= max_overlapped_level.
+        // When max_overlapped_level == 0, we will still compact from L0 -> L1
+        // (or LBase), followed by a bottommost level intra-level compaction
+        // at L1 (or LBase), if applicable.
+        int level = first_overlapped_level;
+        final_output_level = level;
+        int output_level, base_level;
+        while (level < max_overlapped_level || level == 0) {
          output_level = level + 1;
-          if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
-              cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+          if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
              level == 0) {
            output_level = ColumnFamilyData::kCompactToBaseLevel;
          }
-        // if it's a BottommostLevel compaction and `kForce*` compaction is
-        // set, disallow trivial move
-        if (level == max_overlapped_level &&
-            (options.bottommost_level_compaction ==
-                 BottommostLevelCompaction::kForce ||
-             options.bottommost_level_compaction ==
-                 BottommostLevelCompaction::kForceOptimized)) {
-          disallow_trivial_move = true;
+          // Use max value for `max_file_num_to_ignore` to always compact
+          // files down.
+          s = RunManualCompaction(
+              cfd, level, output_level, options, begin, end, exclusive,
+              false /* disallow_trivial_move */,
+              std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+              output_level == ColumnFamilyData::kCompactToBaseLevel
+                  ? &base_level
+                  : nullptr);
+          if (!s.ok()) {
+            break;
+          }
+          if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+            assert(base_level > 0);
+            level = base_level;
+          } else {
+            ++level;
+          }
+          final_output_level = level;
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
        }
-        s = RunManualCompaction(cfd, level, output_level, options, begin, end,
-                                exclusive, disallow_trivial_move,
-                                max_file_num_to_ignore);
-        if (!s.ok()) {
-          break;
-        }
-        if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
-          final_output_level = cfd->NumberLevels() - 1;
-        } else if (output_level > final_output_level) {
-          final_output_level = output_level;
+        if (s.ok()) {
+          assert(final_output_level > 0);
+          // bottommost level intra-level compaction
+          // TODO(cbi): this preserves earlier behavior where if
+          // max_overlapped_level = 0 and bottommost_level_compaction is
+          // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
+          // and do not do intra-LBase compaction even when user configures
+          // compaction filter. We may want to still do a LBase -> LBase
+          // compaction in case there is some file in LBase that did not go
+          // through L0 -> LBase compaction, and hence did not go through
+          // compaction filter.
+          if ((options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kIfHaveCompactionFilter &&
+               max_overlapped_level != 0 &&
+               (cfd->ioptions()->compaction_filter != nullptr ||
+                cfd->ioptions()->compaction_filter_factory != nullptr)) ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForceOptimized ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForce) {
+            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
+            // rewriting newly compacted files when it is kForceOptimized.
+            s = RunManualCompaction(
+                cfd, final_output_level, final_output_level, options, begin,
+                end, exclusive, false /* disallow_trivial_move */,
+                next_file_number /* max_file_num_to_ignore */);
+          }
        }
-        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
-        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
      }
    }
  }
@@ -1798,7 +1809,7 @@ Status DBImpl::RunManualCompaction(
    ColumnFamilyData* cfd, int input_level, int output_level,
    const CompactRangeOptions& compact_range_options, const Slice* begin,
    const Slice* end, bool exclusive, bool disallow_trivial_move,
-    uint64_t max_file_num_to_ignore) {
+    uint64_t max_file_num_to_ignore, int* final_output_level) {
  assert(input_level == ColumnFamilyData::kCompactAllLevels ||
         input_level >= 0);
@@ -1942,6 +1953,15 @@ Status DBImpl::RunManualCompaction(
    } else if (!scheduled) {
      if (compaction == nullptr) {
        manual.done = true;
+        if (final_output_level) {
+          // No compaction needed or there is a conflicting compaction.
+          // Still set `final_output_level` to the level where we would
+          // have compacted to.
+          *final_output_level = output_level;
+          if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+            *final_output_level = cfd->current()->storage_info()->base_level();
+          }
+        }
        bg_cv_.SignalAll();
        continue;
      }
@@ -1975,6 +1995,9 @@ Status DBImpl::RunManualCompaction(
      }
      scheduled = true;
      TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+      if (final_output_level) {
+        *final_output_level = compaction->output_level();
+      }
    }
  }
diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index b036e1ef969..68d818ba02a 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -758,9 +758,10 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) {
  // We created 4 sst files in L0
  ASSERT_EQ("4", FilesPerLevel(0));

-  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file.
+  // Use kForceOptimized to avoid rewriting the new L1 file.
  CompactRangeOptions cro;
-  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
  ASSERT_EQ("0,1", FilesPerLevel(0));

From 5ae7a8d150e1904c233af5ff9ddf2911facb8ad5 Mon Sep 17 00:00:00 2001
From: Changyu Bi
Date: Thu, 1 Jun 2023 15:27:29 -0700
Subject: [PATCH 2/3] `CompactRange()` always compacts to bottommost level for leveled compaction (#11468)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Currently, for leveled compaction, the max output level of a call to `CompactRange()` is pre-computed before compacting each level. This max output level is the max level whose key range overlaps with the manual compaction key range. However, during manual compaction, files in the max output level may be compacted down further by some background compaction. When this background compaction is a trivial move, there is a race condition and the manual compaction may not be able to compact all keys in the specified key range. This PR updates `CompactRange()` to always compact to the bottommost level to make this race condition less likely (it can still happen, see more in the comment here: https://github.com/cbi42/rocksdb/blob/796f58f42ad1bdbf49e5fcf480763f11583b790e/db/db_impl/db_impl_compaction_flush.cc#L1180C29-L1184).

This PR also changes the behavior of CompactRange() when `bottommost_level_compaction=kIfHaveCompactionFilter` (the default option). The old behavior is that, if a compaction filter is provided, CompactRange() always does an intra-level compaction at the final output level for all files in the manual compaction key range. The only exception is when `first_overlapped_level = 0` and `max_overlapped_level = 0`. It’s awkward to maintain the same behavior after this PR since we do not compute max_overlapped_level anymore. So the new behavior is similar to kForceOptimized: always do an intra-level compaction at the bottommost level, but exclude new files generated during this manual compaction.

Several unit tests are updated to work with this new manual compaction behavior.
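For illustration, a minimal sketch of the call whose default behavior changes (the setup is assumed, not part of this patch: `db` is an open DB whose options configure a compaction filter):

    // Sketch under assumed setup: `db` is an open DB* with a compaction
    // filter configured in its options. With this change, the default
    // kIfHaveCompactionFilter compacts the range down to the last non-empty
    // level, then runs an intra-level compaction there that skips files
    // created by this same manual compaction (similar to kForceOptimized).
    CompactRangeOptions cro;
    cro.bottommost_level_compaction =
        BottommostLevelCompaction::kIfHaveCompactionFilter;
    Status s = db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
    assert(s.ok());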
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11468

Test Plan: Add new unit tests `DBCompactionTest.ManualCompactionCompactAllKeysInRange*`

Reviewed By: ajkr

Differential Revision: D46079619

Pulled By: cbi42

fbshipit-source-id: 19d844ba4ec8dc1a0b8af5d2f36ff15820c6e76f
Signed-off-by: tabokie
---
 HISTORY.md                             |  2 +
 db/compaction/compaction_picker.cc     |  6 ++-
 db/db_bloom_filter_test.cc             |  5 +-
 db/db_compaction_test.cc               | 24 +++------
 db/db_impl/db_impl_compaction_flush.cc | 49 ++++++++++++-------
 db/db_test2.cc                         | 11 +++--
 db/manual_compaction_test.cc           |  6 +--
 include/rocksdb/options.h              | 13 +++--
 .../compact_range_to_bottom.md         |  2 +
 9 files changed, 63 insertions(+), 55 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/compact_range_to_bottom.md

diff --git a/HISTORY.md b/HISTORY.md
index e0cc8b78951..a93b137baf5 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -23,6 +23,8 @@
 * For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
 * DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
 * If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB will now try to do a trivial move from L0 to L1 and then do an intra-L1 compaction, instead of an L0 to L1 compaction with trivial move disabled (#11375).
+* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similarly to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)

 ## 6.29.5 (03/29/2022)
 ### Bug Fixes
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 42d7f8b2cc8..fe9bd096e95 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -717,8 +717,10 @@ Compaction* CompactionPicker::CompactRange(
  // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
  // files that are created during the current compaction.
-  if (compact_range_options.bottommost_level_compaction ==
-          BottommostLevelCompaction::kForceOptimized &&
+  if ((compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kForceOptimized ||
+       compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kIfHaveCompactionFilter) &&
      max_file_num_to_ignore != port::kMaxUint64) {
    assert(input_level == output_level);
    // inputs_shrunk holds a continuous subset of input files which were all
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 5b444d841af..c3260785db4 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -2225,10 +2225,9 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
      BottommostLevelCompaction::kSkip;
  compact_options.change_level = true;
  compact_options.target_level = 7;
-  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
-                  .IsNotSupported());
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));

-  ASSERT_EQ(trivial_move, 1);
+  ASSERT_GE(trivial_move, 1);
  ASSERT_EQ(non_trivial_move, 0);

  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 387addaa6f4..fb90eba90be 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -5974,26 +5974,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
  auto start_idx = key_idx;
  GenerateNewFile(&rnd, &key_idx);
  GenerateNewFile(&rnd, &key_idx);
-  auto end_idx = key_idx - 1;
  ASSERT_EQ("1,1,2", FilesPerLevel(0));

-  // Next two CompactRange() calls are used to test exercise error paths within
-  // RefitLevel() before triggering a valid RefitLevel() call
-
-  // Trigger a refit to L1 first
-  {
-    std::string begin_string = Key(start_idx);
-    std::string end_string = Key(end_idx);
-    Slice begin(begin_string);
-    Slice end(end_string);
-
-    CompactRangeOptions cro;
-    cro.change_level = true;
-    cro.target_level = 1;
-    ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
-  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));

+  // The next CompactRange() call is used to exercise error paths within
+  // RefitLevel() before triggering a valid RefitLevel() call
+  //
  // Try a refit from L2->L1 - this should fail and exercise error paths in
  // RefitLevel()
  {
@@ -6008,7 +5996,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
    cro.target_level = 1;
    ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));

  // Try a valid Refit request to ensure, the path is still working
  {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index b8fedbfeb2a..c371d260cd4 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1085,7 +1085,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                            false /* disallow_trivial_move */, port::kMaxUint64);
  } else {
    int first_overlapped_level = kInvalidLevel;
-    int max_overlapped_level = kInvalidLevel;
    {
      SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
      Version* current_version = super_version->current;
@@ -1108,10 +1107,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                                                  begin, end);
        }
        if (overlap) {
-          if (first_overlapped_level == kInvalidLevel) {
-            first_overlapped_level = level;
-          }
-          max_overlapped_level = level;
+          first_overlapped_level = level;
+          break;
        }
      }
      CleanupSuperVersion(super_version);
    }
@@ -1124,7 +1121,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
            cfd, first_overlapped_level, first_overlapped_level, options,
            begin, end, exclusive, true /* disallow_trivial_move */,
            std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */);
-        final_output_level = max_overlapped_level;
+        final_output_level = first_overlapped_level;
      } else {
        assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
        uint64_t next_file_number = versions_->current_next_file_number();
@@ -1135,8 +1132,30 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
        // at L1 (or LBase), if applicable.
        int level = first_overlapped_level;
        final_output_level = level;
-        int output_level, base_level;
-        while (level < max_overlapped_level || level == 0) {
+        int output_level = 0, base_level = 0;
+        for (;;) {
+          // Always allow L0 -> L1 compaction
+          if (level > 0) {
+            if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
+              assert(final_output_level < cfd->ioptions()->num_levels);
+              if (final_output_level + 1 == cfd->ioptions()->num_levels) {
+                break;
+              }
+            } else {
+              // TODO(cbi): there is still a race condition here where a
+              // background compaction can compact some file beyond
+              // current()->storage_info()->num_non_empty_levels() right
+              // after the check here. This should happen very infrequently
+              // and should not happen once a user populates the last level
+              // of the LSM.
+              InstrumentedMutexLock l(&mutex_);
+              // num_non_empty_levels may be lower after a compaction, so
+              // we check for >= here.
+              if (final_output_level + 1 >=
+                  cfd->current()->storage_info()->num_non_empty_levels()) {
+                break;
+              }
+            }
+          }
          output_level = level + 1;
          if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
              level == 0) {
            output_level = ColumnFamilyData::kCompactToBaseLevel;
          }
@@ -1167,17 +1186,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
        if (s.ok()) {
          assert(final_output_level > 0);
          // bottommost level intra-level compaction
-          // TODO(cbi): this preserves earlier behavior where if
-          // max_overlapped_level = 0 and bottommost_level_compaction is
-          // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
-          // and do not do intra-LBase compaction even when user configures
-          // compaction filter. We may want to still do a LBase -> LBase
-          // compaction in case there is some file in LBase that did not go
-          // through L0 -> LBase compaction, and hence did not go through
-          // compaction filter.
          if ((options.bottommost_level_compaction ==
                   BottommostLevelCompaction::kIfHaveCompactionFilter &&
-               max_overlapped_level != 0 &&
               (cfd->ioptions()->compaction_filter != nullptr ||
                cfd->ioptions()->compaction_filter_factory != nullptr)) ||
              options.bottommost_level_compaction ==
                  BottommostLevelCompaction::kForceOptimized ||
              options.bottommost_level_compaction ==
                  BottommostLevelCompaction::kForce) {
            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
-            // rewriting newly compacted files when it is kForceOptimized.
+            // rewriting newly compacted files when it is kForceOptimized
+            // or kIfHaveCompactionFilter with a compaction filter set.
            s = RunManualCompaction(
                cfd, final_output_level, final_output_level, options, begin,
-                end, exclusive, false /* disallow_trivial_move */,
+                end, exclusive, true /* disallow_trivial_move */,
                next_file_number /* max_file_num_to_ignore */);
          }
        }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 2f91be2f9cd..9dc0b4282c2 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -2206,6 +2206,7 @@ class PinL0IndexAndFilterBlocksTest
    ASSERT_OK(Flush(1));
    // move this table to L1
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1, 1));

    // reset block cache
    table_options.block_cache = NewLRUCache(64 * 1024);
@@ -2363,7 +2364,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
  // this should be read from L1
  value = Get(1, "a");
  if (!disallow_preload_) {
-    // In inifinite max files case, there's a cache miss in executing Get()
+    // In the infinite max files case, there's a cache miss in executing Get()
    // because index and filter are not prefetched before.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
@@ -2391,12 +2392,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Bloom and index hit will happen when a Get() happens.
@@ -2405,12 +2406,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
}

diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc
index 2eb4812c71e..60aa389022e 100644
--- a/db/manual_compaction_test.cc
+++ b/db/manual_compaction_test.cc
@@ -288,9 +288,9 @@ TEST_F(ManualCompactionTest, SkipLevel) {
  filter->Reset();
  ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
  ASSERT_EQ(4, filter->NumKeys());
-  // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
-  // so finally the logged level for 1 is L1.
-  ASSERT_EQ(1, filter->KeyLevel("1"));
+  // 1 is first compacted from L0 to L1, and then an L1 intra-level
+  // compaction compacts [2, 4, 8] only.
+  ASSERT_EQ(0, filter->KeyLevel("1"));
  ASSERT_EQ(1, filter->KeyLevel("2"));
  ASSERT_EQ(1, filter->KeyLevel("4"));
  ASSERT_EQ(1, filter->KeyLevel("8"));
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ee8d89a3b3a..c489412d8bd 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1763,15 +1763,18 @@ struct CompactionOptions {
 // For level based compaction, we can configure if we want to skip/force
 // bottommost level compaction.
 enum class BottommostLevelCompaction {
-  // Skip bottommost level compaction
+  // Skip bottommost level compaction.
  kSkip,
-  // Only compact bottommost level if there is a compaction filter
-  // This is the default option
+  // Only compact bottommost level if there is a compaction filter.
+  // This is the default option.
+  // Similar to kForceOptimized, when compacting the bottommost level, avoid
+  // double-compacting files created in the same manual compaction.
  kIfHaveCompactionFilter,
-  // Always compact bottommost level
+  // Always compact bottommost level.
  kForce,
  // Always compact bottommost level but in bottommost level avoid
-  // double-compacting files created in the same compaction
+  // double-compacting files created in the same compaction.
  kForceOptimized,
};

diff --git a/unreleased_history/behavior_changes/compact_range_to_bottom.md b/unreleased_history/behavior_changes/compact_range_to_bottom.md
new file mode 100644
index 00000000000..480caed1ed5
--- /dev/null
+++ b/unreleased_history/behavior_changes/compact_range_to_bottom.md
@@ -0,0 +1,2 @@
+For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similarly to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
\ No newline at end of file

From e7d2fc37e958258fa9b68e074477c78244831a5c Mon Sep 17 00:00:00 2001
From: tabokie
Date: Tue, 15 Aug 2023 08:22:55 +0200
Subject: [PATCH 3/3] add check in range interface

Signed-off-by: tabokie
---
 db/db_impl/db_impl.h        |  1 +
 db/db_impl/db_impl_merge.cc | 26 ++++++++++++++++++++++++++
 include/rocksdb/db.h        |  6 ++++++
 3 files changed, 33 insertions(+)

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 509d0308665..fa73ba58ba2 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -969,6 +969,7 @@ class DBImpl : public DB {
  Status MergeDisjointInstances(const MergeInstanceOptions& merge_options,
                                const std::vector<std::string>& instances) override;
+  Status CheckInRange(const Slice* begin, const Slice* end) override;

  static IOStatus CreateAndNewDirectory(
      FileSystem* fs, const std::string& dirname,
diff --git a/db/db_impl/db_impl_merge.cc b/db/db_impl/db_impl_merge.cc
index 95217feec9b..a046693ed99 100644
--- a/db/db_impl/db_impl_merge.cc
+++ b/db/db_impl/db_impl_merge.cc
@@ -57,6 +57,32 @@ Status DBImpl::ValidateForMerge(const MergeInstanceOptions& mopts,
  return Status::OK();
}

+Status DBImpl::CheckInRange(const Slice* begin, const Slice* end) {
+  Status s;
+  if (begin == nullptr && end == nullptr) {
+    return s;
+  }
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    assert(cfd != nullptr);
+    auto* comparator = cfd->user_comparator();
+    PinnableSlice smallest, largest;
+    bool found = false;
+    s = cfd->GetUserKeyRange(&smallest, &largest, &found);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!found) {
+      continue;
+    }
+    if (begin != nullptr && comparator->Compare(smallest, *begin) < 0) {
+      return Status::InvalidArgument("Has data smaller than left boundary");
+    } else if (end != nullptr && comparator->Compare(largest, *end) >= 0) {
+      return Status::InvalidArgument("Has data larger than right boundary");
+    }
+  }
+  return s;
+}
+
 Status DBImpl::MergeDisjointInstances(const MergeInstanceOptions& merge_options,
                                       const std::vector<std::string>& instances) {
  Status s;
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 4f95a9865a8..b9c3a7ebaf7 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -322,6 +322,12 @@ class DB {
    return Status::NotSupported("`MergeDisjointInstances` not implemented");
  }

+  // Check that all data written before this call is in the range
+  // [begin, end). Return InvalidArgument if not.
+  virtual Status CheckInRange(const Slice* /*begin*/, const Slice* /*end*/) {
+    return Status::NotSupported("`CheckInRange` not implemented");
+  }
+
  virtual Status Resume() { return Status::NotSupported(); }

  // Close the DB by releasing resources, closing files etc. This should be