diff --git a/HISTORY.md b/HISTORY.md
index 5ea3b61e4f9..a672a9451b3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -27,7 +27,8 @@
 * For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
 * DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
 * In leveled compaction with dynamic levelling, level multiplier is not anymore adjusted due to oversized L0. Instead, compaction score is adjusted by increasing size level target by adding incoming bytes from upper levels. This would deprioritize compactions from upper levels if more data from L0 is coming. This is to fix some unnecessary full stalling due to drastic change of level targets, while not wasting write bandwidth for compaction while writes are overloaded.
-* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes`.
+* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes`.
+* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). Together with #11321, this makes it automatic to migrate from other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have its unnecessary levels drained, which can help reduce read and space amplification.
 
 ## 6.29.5 (03/29/2022)
 ### Bug Fixes
diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index da325a8a2de..eb3dbd6c7bf 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -200,7 +200,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
 TEST_F(CompactFilesTest, MultipleLevel) {
   Options options;
   options.create_if_missing = true;
-  options.level_compaction_dynamic_level_bytes = true;
+  // Otherwise background compaction can happen to
+  // drain unnecessary levels
+  options.level_compaction_dynamic_level_bytes = false;
   options.num_levels = 6;
   // Add listener
   FlushedFileCollector* collector = new FlushedFileCollector();
@@ -258,7 +260,6 @@ TEST_F(CompactFilesTest, MultipleLevel) {
   for (int invalid_output_level = 0; invalid_output_level < 5;
        invalid_output_level++) {
     s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
-    std::cout << s.ToString() << std::endl;
     ASSERT_TRUE(s.IsInvalidArgument());
   }
 
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 66f8cab7d1d..33e4e1ee70b 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -7429,6 +7429,139 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
 
 #endif  // !defined(ROCKSDB_LITE)
 
+TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterMultiplierChanged) {
+  // When the level size multiplier increases such that fewer levels become
+  // necessary, unnecessary levels should be drained.
+  const int kBaseLevelBytes = 256 << 10;  // 256KB
+  const int kFileBytes = 64 << 10;        // 64KB
+  const int kInitMultiplier = 2, kChangedMultiplier = 10;
+  const int kNumFiles = 32;
+  const int kNumLevels = 5;
+  const int kValueBytes = 1 << 10;  // 1KB
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kInitMultiplier;
+  options.num_levels = kNumLevels;
+  Reopen(options);
+
+  // Initially we setup the LSM to look roughly as follows:
+  //
+  // L0: empty
+  // L1: 256KB
+  // ...
+  // L4: 1MB
+  Random rnd(301);
+  for (int file = 0; file < kNumFiles; ++file) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i),
+                    rnd.RandomString(kValueBytes)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int init_num_nonempty = 0;
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int level = 1; level < kNumLevels; ++level) {
+    if (NumTableFilesAtLevel(level) > 0) {
+      ++init_num_nonempty;
+    }
+  }
+
+  // After increasing the multiplier and running compaction, fewer levels are
+  // needed to hold all the data. Unnecessary levels should be drained.
+  ASSERT_OK(db_->SetOptions({{"max_bytes_for_level_multiplier",
+                              std::to_string(kChangedMultiplier)}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  int final_num_nonempty = 0;
+  for (int level = 1; level < kNumLevels; ++level) {
+    if (NumTableFilesAtLevel(level) > 0) {
+      ++final_num_nonempty;
+    }
+  }
+  ASSERT_GT(init_num_nonempty, final_num_nonempty);
+}
+
+// Disabled because `CompactRange` doesn't work as expected.
+TEST_F(DBCompactionTest, DISABLED_DrainUnnecessaryLevelsAfterDBBecomesSmall) {
+  // When the DB size is smaller, e.g., a large chunk of data deleted by
+  // DeleteRange(), unnecessary levels should be drained.
+  const int kBaseLevelBytes = 256 << 10;  // 256KB
+  const int kFileBytes = 64 << 10;        // 64KB
+  const int kMultiplier = 2;
+  const int kNumFiles = 32;
+  const int kNumLevels = 5;
+  const int kValueBytes = 1 << 10;  // 1KB
+  const int kDeleteFileNum = 8;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.num_levels = kNumLevels;
+  Reopen(options);
+
+  // Initially we setup the LSM to look roughly as follows:
+  //
+  // L0: empty
+  // L1: 256KB
+  // ...
+  // L4: 1MB
+  Random rnd(301);
+  for (int file = 0; file < kNumFiles; ++file) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i),
+                    rnd.RandomString(kValueBytes)));
+    }
+    ASSERT_OK(Flush());
+    if (file == kDeleteFileNum) {
+      // Ensure the DeleteRange() below only deletes data from the last level
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      ASSERT_EQ(NumTableFilesAtLevel(kNumLevels - 1), kDeleteFileNum + 1);
+    }
+  }
+
+  int init_num_nonempty = 0;
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int level = 1; level < kNumLevels; ++level) {
+    if (NumTableFilesAtLevel(level) > 0) {
+      ++init_num_nonempty;
+    }
+  }
+
+  // Disable auto compaction for the CompactRange() below
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+  // Delete keys within the first (kDeleteFileNum + 1) files' key ranges.
+  // This should reduce DB size enough such that there is now
+  // an unneeded level.
+  std::string begin = Key(0);
+  std::string end = Key(kDeleteFileNum * kFileBytes / kValueBytes);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), begin, end));
+  Slice begin_slice = begin;
+  Slice end_slice = end;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_slice, &end_slice));
+  int after_delete_range_nonempty = 0;
+  for (int level = 1; level < kNumLevels; ++level) {
+    if (NumTableFilesAtLevel(level) > 0) {
+      ++after_delete_range_nonempty;
+    }
+  }
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  int final_num_nonempty = 0;
+  for (int level = 1; level < kNumLevels; ++level) {
+    if (NumTableFilesAtLevel(level) > 0) {
+      ++final_num_nonempty;
+    }
+  }
+  ASSERT_GE(init_num_nonempty, after_delete_range_nonempty);
+  ASSERT_GT(after_delete_range_nonempty, final_num_nonempty);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 6c6183ab2d2..b0ad3147224 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -542,6 +542,7 @@ Status DBImpl::Recover(
           !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) {
         int to_level = cfd->ioptions()->num_levels - 1;
         // last level is reserved
+        // allow_ingest_behind does not support Level Compaction.
        if (cfd->ioptions()->allow_ingest_behind) {
           to_level -= 1;
         }
diff --git a/db/db_test.cc b/db/db_test.cc
index f2373ad1459..934177d9557 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -360,7 +360,7 @@ TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
     for (int i = 0; i < 2; ++i) {
       threads.emplace_back(write_no_slowdown_func);
     }
-    // Sleep for 2s to allow the threads to insert themselves into the
+    // Sleep for 3s to allow the threads to insert themselves into the
     // write queue
     env_->SleepForMicroseconds(3000000ULL);
   }
@@ -431,7 +431,7 @@ TEST_F(DBTest, MixedSlowdownOptionsStop) {
     for (int i = 0; i < 2; ++i) {
       threads.emplace_back(write_no_slowdown_func);
     }
-    // Sleep for 2s to allow the threads to insert themselves into the
+    // Sleep for 3s to allow the threads to insert themselves into the
     // write queue
     env_->SleepForMicroseconds(3000000ULL);
   }
diff --git a/db/version_set.cc b/db/version_set.cc
index 25302bfaea8..d85aec213d2 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2821,7 +2821,7 @@ void VersionStorageInfo::ComputeCompactionScore(
           }
         }
       }
-    } else {
+    } else {  // level > 0
       // Compute the ratio of current size to size limit.
       uint64_t level_bytes_no_compacting = 0;
       uint64_t level_total_bytes = 0;
@@ -2831,21 +2831,36 @@ void VersionStorageInfo::ComputeCompactionScore(
           level_bytes_no_compacting += f->compensated_file_size;
         }
       }
-      if (!immutable_options.level_compaction_dynamic_level_bytes ||
-          level_bytes_no_compacting < MaxBytesForLevel(level)) {
+      if (!immutable_options.level_compaction_dynamic_level_bytes) {
         score = static_cast<double>(level_bytes_no_compacting) /
                 MaxBytesForLevel(level);
       } else {
-        // If there are a large mount of data being compacted down to the
-        // current level soon, we would de-prioritize compaction from
-        // a level where the incoming data would be a large ratio. We do
-        // it by dividing level size not by target level size, but
-        // the target size and the incoming compaction bytes.
-        score = static_cast<double>(level_bytes_no_compacting) /
-                (MaxBytesForLevel(level) + total_downcompact_bytes) *
-                kScoreScale;
+        if (level_bytes_no_compacting < MaxBytesForLevel(level)) {
+          score = static_cast<double>(level_bytes_no_compacting) /
+                  MaxBytesForLevel(level);
+        } else {
+          // If there is a large amount of data being compacted down to the
+          // current level soon, we would de-prioritize compaction from
+          // a level where the incoming data would be a large ratio. We do
+          // it by dividing level size not by target level size, but
+          // the target size and the incoming compaction bytes.
+          score = static_cast<double>(level_bytes_no_compacting) /
+                  (MaxBytesForLevel(level) + total_downcompact_bytes) *
+                  kScoreScale;
+        }
+        // Drain unnecessary levels, but with lower priority compared to
+        // when L0 is eligible. Only non-empty levels can be unnecessary.
+        // If there are no unnecessary levels, lowest_unnecessary_level_ = -1.
+        if (level_bytes_no_compacting > 0 &&
+            level <= lowest_unnecessary_level_) {
+          score = std::max(
+              score, kScoreScale *
+                         (1.001 + 0.001 * (lowest_unnecessary_level_ - level)));
+        }
       }
-      if (level_total_bytes > MaxBytesForLevel(level)) {
+      if (level <= lowest_unnecessary_level_) {
+        total_downcompact_bytes += level_total_bytes;
+      } else if (level_total_bytes > MaxBytesForLevel(level)) {
         total_downcompact_bytes +=
             static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
       }
@@ -3758,6 +3773,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
       }
     }
   } else {
+    assert(ioptions.compaction_style == kCompactionStyleLevel);
     uint64_t max_level_size = 0;
 
     int first_non_empty_level = -1;
@@ -3782,11 +3798,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
       level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
     }
 
+    lowest_unnecessary_level_ = -1;
     if (max_level_size == 0) {
       // No data for L1 and up. L0 compacts to last level directly.
       // No compaction from L1+ needs to be scheduled.
       base_level_ = num_levels_ - 1;
     } else {
+      assert(first_non_empty_level >= 1);
       uint64_t base_bytes_max = options.max_bytes_for_level_base;
       uint64_t base_bytes_min = static_cast<uint64_t>(
           base_bytes_max / options.max_bytes_for_level_multiplier);
@@ -3797,20 +3815,38 @@
         // Round up after dividing
         cur_level_size = static_cast<uint64_t>(
             cur_level_size / options.max_bytes_for_level_multiplier);
+        if (lowest_unnecessary_level_ == -1 &&
+            cur_level_size <= base_bytes_min) {
+          // When per_key_placement is enabled, the penultimate level is
+          // necessary.
+          lowest_unnecessary_level_ = i;
+        }
       }
 
       // Calculate base level and its size.
       uint64_t base_level_size;
       if (cur_level_size <= base_bytes_min) {
+        // If per_key_placement is not enabled,
+        // either there is only one non-empty level after level 0,
+        // which can be less than base_bytes_min AND necessary,
+        // or there is some unnecessary level.
+        assert(first_non_empty_level == num_levels_ - 1 ||
+               lowest_unnecessary_level_ != -1);
         // Case 1. If we make target size of last level to be max_level_size,
         // target size of the first non-empty level would be smaller than
         // base_bytes_min. We set it be base_bytes_min.
         base_level_size = base_bytes_min + 1U;
         base_level_ = first_non_empty_level;
-        ROCKS_LOG_INFO(ioptions.logger,
-                       "More existing levels in DB than needed. "
" - "max_bytes_for_level_multiplier may not be guaranteed."); + if (base_level_ < num_levels_ - 1) { + ROCKS_LOG_INFO( + ioptions.logger, + "More existing levels in DB than needed: all non-zero " + "levels <= level %d are unnecessary. " + "max_bytes_for_level_multiplier may not be guaranteed.", + lowest_unnecessary_level_); + } } else { + assert(lowest_unnecessary_level_ == -1); // Find base level (where L0 data is compacted to). base_level_ = first_non_empty_level; while (base_level_ > 1 && cur_level_size > base_bytes_max) { diff --git a/db/version_set.h b/db/version_set.h index 96492c67a1a..59c365a174e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -576,6 +576,12 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + // Applies to level compaction when + // `level_compaction_dynamic_level_bytes=true`. All non-empty levels <= + // lowest_unnecessary_level_ are not needed and will be drained automatically. + // -1 if there is no unnecessary level, + int lowest_unnecessary_level_; + double level_multiplier_; // A list for the same set of files that are stored in files_, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index e8d75108ec5..bbc90a44bab 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -204,6 +204,16 @@ class VersionStorageInfoTestBase : public testing::Test { } return result; } + + void UpdateVersionStorageInfo() { + vstorage_.UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_.GenerateLevel0NonOverlapping(); + vstorage_.SetFinalized(); + } }; class VersionStorageInfoTest : public VersionStorageInfoTestBase { @@ -404,6 +414,37 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2)); } +TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // Create a few unnecessary levels. + // See if score is calculated correctly. + Add(5, 1U, "1", "2", 2000U); // target size 1010000 + Add(4, 2U, "1", "2", 200U); // target size 101000 + // Unnecessary levels + Add(3, 3U, "1", "2", 100U); // target size 10100 + // Level 2: target size 1010 + Add(1, 4U, "1", "2", + 10U); // target size 1000 = max(base_bytes_min + 1, base_bytes_max) + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, vstorage_.base_level()); + ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1)); + ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3)); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + + // Tests that levels 1 and 3 are eligible for compaction. + // Levels 1 and 3 are much smaller than target size, + // so size does not contribute to a high compaction score. 
+  ASSERT_EQ(1, vstorage_.CompactionScoreLevel(0));
+  ASSERT_GT(vstorage_.CompactionScore(0), 10);
+  ASSERT_EQ(3, vstorage_.CompactionScoreLevel(1));
+  ASSERT_GT(vstorage_.CompactionScore(1), 10);
+}
+
 TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
   // Test whether the overlaps are detected as expected
   Add(1, 1U, "4", "7", 1U);  // Perfect overlap with last level
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 913e889acbf..f552ace0a0f 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -537,7 +537,7 @@ struct AdvancedColumnFamilyOptions {
   // and max_bytes_for_level_base=10MB.
   // Target sizes of level 1 to 5 starts with:
   // [- - - - 10MB]
-  // with base level is level. Target sizes of level 1 to 4 are not applicable
+  // with base level is level 5. Target sizes of level 1 to 4 are not applicable
   // because they will not be used.
   // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
   // base target to level 4 and now the targets looks like:
@@ -610,8 +610,8 @@ struct AdvancedColumnFamilyOptions {
   // computed based on this feature) in the LSM after a user migrates to turn
   // this feature on. This is especially likely when a user migrates from
   // leveled compaction with a smaller multiplier or from universal compaction.
-  // A full manual compaction is needed to drain these levels explicitly.
-  //
+  // RocksDB will gradually drain these unnecessary levels by compacting files
+  // down the LSM.
   //
   // Default: false
   bool level_compaction_dynamic_level_bytes = false;
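
Editor's note: the sketch below is not part of the diff. It illustrates, under stated assumptions, how the behavior described in the HISTORY.md entry and in the updated `level_compaction_dynamic_level_bytes` comment is expected to look from the user side. The DB path "/tmp/lsm_migration_example" and the property-printing loop are hypothetical; only public RocksDB APIs (`DB::Open`, `DB::GetProperty` with the "rocksdb.num-files-at-level<N>" property) are used. After reopening an existing DB (for example, one previously using universal compaction or a smaller level multiplier) with this option enabled, background compaction should gradually empty the levels that are no longer needed, instead of requiring a full manual compaction.

// Illustrative sketch only (not from the PR). Assumes an existing DB at the
// hypothetical path below that was written with a different compaction
// configuration and therefore has more populated levels than it needs.
#include <cstdio>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // With this option on, unnecessary levels are drained by background
  // compaction after the DB is reopened; a full manual compaction is no
  // longer required for the migration.
  options.level_compaction_dynamic_level_bytes = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/lsm_migration_example", &db);
  if (!s.ok()) {
    std::fprintf(stderr, "Open failed: %s\n", s.ToString().c_str());
    return 1;
  }

  // Observe which levels are populated; levels flagged as unnecessary are
  // expected to empty out over time as background compactions run.
  for (int level = 0; level < options.num_levels; ++level) {
    std::string num_files;
    if (db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
                        &num_files)) {
      std::printf("L%d: %s file(s)\n", level, num_files.c_str());
    }
  }

  delete db;
  return 0;
}

Note that the assertion-style verification used in the new DBCompactionTest above relies on internal test helpers (`dbfull()->TEST_WaitForCompact()`, `NumTableFilesAtLevel()`) that are not part of the public API, which is why this sketch only reads the public per-level file-count property.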