diff --git a/HISTORY.md b/HISTORY.md index 1297cf2e11f8..6abd5737f18f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -27,6 +27,8 @@ * For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes`. * For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp. * If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do trivial move from L0 to L1 and then do an intra L1 compaction, instead of a L0 to L1 compaction with trivial move disabled (#11375). +* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468) +For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468) ## 6.29.5 (03/29/2022) ### Bug Fixes diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 42d7f8b2cc8e..fe9bd096e951 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -717,8 +717,10 @@ Compaction* CompactionPicker::CompactRange( // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out // files that are created during the current compaction. 
- if (compact_range_options.bottommost_level_compaction == - BottommostLevelCompaction::kForceOptimized && + if ((compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized || + compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter) && max_file_num_to_ignore != port::kMaxUint64) { assert(input_level == output_level); // inputs_shrunk holds a continuous subset of input files which were all diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 5b444d841af2..f75a190f23e9 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -2228,7 +2228,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) .IsNotSupported()); - ASSERT_EQ(trivial_move, 1); + ASSERT_GE(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 66ea698d5352..12fe18bc947a 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -6035,26 +6035,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { auto start_idx = key_idx; GenerateNewFile(&rnd, &key_idx); GenerateNewFile(&rnd, &key_idx); - auto end_idx = key_idx - 1; ASSERT_EQ("1,1,2", FilesPerLevel(0)); - // Next two CompactRange() calls are used to test exercise error paths within - // RefitLevel() before triggering a valid RefitLevel() call - - // Trigger a refit to L1 first - { - std::string begin_string = Key(start_idx); - std::string end_string = Key(end_idx); - Slice begin(begin_string); - Slice end(end_string); - - CompactRangeOptions cro; - cro.change_level = true; - cro.target_level = 1; - ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); - } - ASSERT_EQ("0,3,2", FilesPerLevel(0)); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,2", FilesPerLevel(0)); + // The next CompactRange() 
call is used to exercise error paths within
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // one file in L1: [Key(5), Key(6)] + ASSERT_OK( + db_->Put(WriteOptions(), Key(5), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(6), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + ASSERT_OK( + db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 2))); + // We now do manual compaction for key range [Key(1), Key(6)]. + // First it compacts file [Key(1)] to L1. + // L1 will have two files [Key(1)], and [Key(5), Key(6)]. + // After L0 -> L1 manual compaction, an automatic compaction will trivially + // move both files from L1 to L2. Here the dependency makes manual compaction + // wait for auto-compaction to pick a compaction before proceeding. Manual + // compaction should not stop at L1 and keep compacting L2. With kForce + // specified, expected output is that manual compaction compacts to L2 and L2 + // will contain 2 files: one for Key(1000) and one for Key(1), Key(5) and + // Key(6). + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction():AfterPickCompaction", + "DBImpl::RunManualCompaction()::1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + std::string begin_str = Key(1); + std::string end_str = Key(6); + Slice begin_slice = begin_str; + Slice end_slice = end_str; + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + + ASSERT_EQ(NumTableFilesAtLevel(2), 2); +} + +TEST_F(DBCompactionTest, + ManualCompactionCompactAllKeysInRangeDynamicLevelBytes) { + // Similar to the test above (ManualCompactionCompactAllKeysInRange), but with + // level_compaction_dynamic_level_bytes = true. 
+ const int kBaseLevelBytes = 8 << 20; // 8MB + const int kMultiplier = 2; + Options options = CurrentOptions(); + options.num_levels = 7; + options.level_compaction_dynamic_level_bytes = true; + options.compaction_style = kCompactionStyleLevel; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kMultiplier; + options.compression = kNoCompression; + options.target_file_size_base = 2 * kBaseLevelBytes; + DestroyAndReopen(options); + + Random rnd(301); + ASSERT_OK(db_->Put(WriteOptions(), Key(5), + rnd.RandomString(3 * kBaseLevelBytes / 2))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + // L6 now has one file with size ~ 3/2 * kBaseLevelBytes. + // L5 is the new base level, with target size ~ 3/4 * kBaseLevelBytes. + + ASSERT_OK( + db_->Put(WriteOptions(), Key(3), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(4), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK(Flush()); + + MoveFilesToLevel(5); + ASSERT_EQ(1, NumTableFilesAtLevel(5)); + // L5 now has one file with size ~ 2/3 * kBaseLevelBytes, which is below its + // target size. + + ASSERT_OK( + db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(2), rnd.RandomString(kBaseLevelBytes / 3))); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction():AfterPickCompaction", + "DBImpl::RunManualCompaction()::1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + // After compacting the file with [Key(1), Key(2)] to L5, + // L5 has size ~ 4/3 * kBaseLevelBytes > its target size. + // We let manual compaction wait for an auto-compaction to pick + // a compaction before proceeding. The auto-compaction would + // trivially move both files in L5 down to L6. 
If manual compaction + // works correctly with kForce specified, it should rewrite the two files in + // L6 into a single file. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + std::string begin_str = Key(1); + std::string end_str = Key(4); + Slice begin_slice = begin_str; + Slice end_slice = end_str; + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index d8d11dc6114a..334bbbf99dac 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1085,7 +1085,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, false /* disable_trivial_move */, port::kMaxUint64); } else { int first_overlapped_level = kInvalidLevel; - int max_overlapped_level = kInvalidLevel; { SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); Version* current_version = super_version->current; @@ -1108,10 +1107,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, begin, end); } if (overlap) { - if (first_overlapped_level == kInvalidLevel) { - first_overlapped_level = level; - } - max_overlapped_level = level; + first_overlapped_level = level; + break; } } CleanupSuperVersion(super_version); @@ -1125,7 +1122,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, end, exclusive, true /* disallow_trivial_move */, std::numeric_limits::max() /* max_file_num_to_ignore */, trim_ts); - final_output_level = max_overlapped_level; + final_output_level = first_overlapped_level; } else { assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel); uint64_t next_file_number = versions_->current_next_file_number(); @@ -1136,8 +1133,30 @@ Status DBImpl::CompactRangeInternal(const 
CompactRangeOptions& options, // at L1 (or LBase), if applicable. int level = first_overlapped_level; final_output_level = level; - int output_level, base_level; - while (level < max_overlapped_level || level == 0) { + int output_level = 0, base_level = 0; + for (;;) { + // Always allow L0 -> L1 compaction + if (level > 0) { + if (cfd->ioptions()->level_compaction_dynamic_level_bytes) { + assert(final_output_level < cfd->ioptions()->num_levels); + if (final_output_level + 1 == cfd->ioptions()->num_levels) { + break; + } + } else { + // TODO(cbi): there is still a race condition here where + // if a background compaction compacts some file beyond + // current()->storage_info()->num_non_empty_levels() right after + // the check here.This should happen very infrequently and should + // not happen once a user populates the last level of the LSM. + InstrumentedMutexLock l(&mutex_); + // num_non_empty_levels may be lower after a compaction, so + // we check for >= here. + if (final_output_level + 1 >= + cfd->current()->storage_info()->num_non_empty_levels()) { + break; + } + } + } output_level = level + 1; if (cfd->ioptions()->level_compaction_dynamic_level_bytes && level == 0) { @@ -1169,17 +1188,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, if (s.ok()) { assert(final_output_level > 0); // bottommost level intra-level compaction - // TODO(cbi): this preserves earlier behavior where if - // max_overlapped_level = 0 and bottommost_level_compaction is - // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction - // and do not do intra-LBase compaction even when user configures - // compaction filter. We may want to still do a LBase -> LBase - // compaction in case there is some file in LBase that did not go - // through L0 -> LBase compaction, and hence did not go through - // compaction filter. 
if ((options.bottommost_level_compaction == BottommostLevelCompaction::kIfHaveCompactionFilter && - max_overlapped_level != 0 && (cfd->ioptions()->compaction_filter != nullptr || cfd->ioptions()->compaction_filter_factory != nullptr)) || options.bottommost_level_compaction == @@ -1187,10 +1197,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, options.bottommost_level_compaction == BottommostLevelCompaction::kForce) { // Use `next_file_number` as `max_file_num_to_ignore` to avoid - // rewriting newly compacted files when it is kForceOptimized. + // rewriting newly compacted files when it is kForceOptimized + // or kIfHaveCompactionFilter with compaction filter set. s = RunManualCompaction( cfd, final_output_level, final_output_level, options, begin, - end, exclusive, !trim_ts.empty() /* disallow_trivial_move */, + end, exclusive, true /* disallow_trivial_move */, next_file_number /* max_file_num_to_ignore */, trim_ts); } } diff --git a/db/db_test2.cc b/db/db_test2.cc index 2f91be2f9cda..9dc0b4282c2c 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2206,6 +2206,7 @@ class PinL0IndexAndFilterBlocksTest ASSERT_OK(Flush(1)); // move this table to L1 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_EQ(1, NumTableFilesAtLevel(1, 1)); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); @@ -2363,7 +2364,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { // this should be read from L1 value = Get(1, "a"); if (!disallow_preload_) { - // In inifinite max files case, there's a cache miss in executing Get() + // In infinite max files case, there's a cache miss in executing Get() // because index and filter are not prefetched before. 
ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); @@ -2391,12 +2392,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. 
@@ -2405,12 +2406,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 2eb4812c71e2..60aa389022e3 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -288,9 +288,9 @@ TEST_F(ManualCompactionTest, SkipLevel) { filter->Reset(); ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(4, filter->NumKeys()); - // 1 is first compacted to L1 and then further compacted into [2, 4, 8], - // so finally the logged level for 1 is L1. - ASSERT_EQ(1, filter->KeyLevel("1")); + // 1 is first compacted from L0 to L1, and then L1 intra level compaction + // compacts [2, 4, 8] only. + ASSERT_EQ(0, filter->KeyLevel("1")); ASSERT_EQ(1, filter->KeyLevel("2")); ASSERT_EQ(1, filter->KeyLevel("4")); ASSERT_EQ(1, filter->KeyLevel("8")); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ee8d89a3b3a1..c489412d8bdb 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1763,15 +1763,18 @@ struct CompactionOptions { // For level based compaction, we can configure if we want to skip/force // bottommost level compaction. 
enum class BottommostLevelCompaction { - // Skip bottommost level compaction + // Skip bottommost level compaction. kSkip, - // Only compact bottommost level if there is a compaction filter - // This is the default option + // Only compact bottommost level if there is a compaction filter. + // This is the default option. + // Similar to kForceOptimized, when compacting bottommost level, avoid + // double-compacting files + // created in the same manual compaction. kIfHaveCompactionFilter, - // Always compact bottommost level + // Always compact bottommost level. kForce, // Always compact bottommost level but in bottommost level avoid - // double-compacting files created in the same compaction + // double-compacting files created in the same compaction. kForceOptimized, }; diff --git a/unreleased_history/behavior_changes/compact_range_to_bottom.md b/unreleased_history/behavior_changes/compact_range_to_bottom.md new file mode 100644 index 000000000000..480caed1ed5b --- /dev/null +++ b/unreleased_history/behavior_changes/compact_range_to_bottom.md @@ -0,0 +1,2 @@ +For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468) +For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468) \ No newline at end of file