diff --git a/HISTORY.md b/HISTORY.md
index 00c5e90981c..5ea3b61e4f9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -27,6 +27,7 @@
 * For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
 * DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
 * In leveled compaction with dynamic levelling, level multiplier is not anymore adjusted due to oversized L0. Instead, compaction score is adjusted by increasing size level target by adding incoming bytes from upper levels. This would deprioritize compactions from upper levels if more data from L0 is coming. This is to fix some unnecessary full stalling due to drastic change of level targets, while not wasting write bandwidth for compaction while writes are overloaded.
+* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves files down the LSM to fill it starting from the bottommost level during DB open. See more in the comments for option `level_compaction_dynamic_level_bytes`.

 ## 6.29.5 (03/29/2022)
 ### Bug Fixes
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index cb4ef39714f..6c6183ab2d2 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -520,7 +520,100 @@ Status DBImpl::Recover(
   if (!s.ok()) {
     return s;
   }
+  if (s.ok() && !read_only) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // Try to trivially move files down the LSM tree to start from the
+      // bottommost level when level_compaction_dynamic_level_bytes is
+      // enabled. This should only be useful when a user is migrating to
+      // turn this option on.
+      // If a user is migrating from Level Compaction with a smaller level
+      // multiplier or from Universal Compaction, there may be too many
+      // non-empty levels and the trivial moves here are not sufficient for
+      // migration. Additional compactions are needed to drain unnecessary
+      // levels.
+      //
+      // Note that this step moves files down the LSM without consulting
+      // SSTPartitioner. Further compactions are still needed if
+      // the user wants to partition SST files.
+      // Note that files moved in this step may not respect the compression
+      // option of the target level.
+      if (cfd->ioptions()->compaction_style ==
+              CompactionStyle::kCompactionStyleLevel &&
+          cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+          !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) {
+        int to_level = cfd->ioptions()->num_levels - 1;
+        // last level is reserved
+        if (cfd->ioptions()->allow_ingest_behind) {
+          to_level -= 1;
+        }
+        // Whether this column family has a level trivially moved
+        bool moved = false;
+        // Fill the LSM starting from to_level and going up one level at a
+        // time. Some loop invariants (when the last level is not reserved):
+        // - levels in (from_level, to_level] are empty, and
+        // - levels in (to_level, last_level] are non-empty.
+        for (int from_level = to_level; from_level >= 0; --from_level) {
+          const std::vector<FileMetaData*>& level_files =
+              cfd->current()->storage_info()->LevelFiles(from_level);
+          if (level_files.empty() || from_level == 0) {
+            continue;
+          }
+          assert(from_level <= to_level);
+          // Trivially move files from `from_level` to `to_level`.
+          if (from_level < to_level) {
+            if (!moved) {
+              // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with
+              // 7 levels
+              std::string lsm_state = "[";
+              for (int i = 0; i < cfd->ioptions()->num_levels; ++i) {
+                lsm_state += std::to_string(
+                    cfd->current()->storage_info()->NumLevelFiles(i));
+                if (i < cfd->ioptions()->num_levels - 1) {
+                  lsm_state += ",";
+                }
+              }
+              lsm_state += "]";
+              ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                             "[%s] Trivially moving files down the LSM when "
+                             "open with "
+                             "level_compaction_dynamic_level_bytes=true, "
+                             "lsm_state: %s (Files are moved only if DB "
+                             "Recovery is successful).",
+                             cfd->GetName().c_str(), lsm_state.c_str());
+              moved = true;
+            }
+            ROCKS_LOG_WARN(
+                immutable_db_options_.info_log,
+                "[%s] Moving %zu files from level-%d to level-%d",
+                cfd->GetName().c_str(), level_files.size(), from_level,
+                to_level);
+            VersionEdit edit;
+            edit.SetColumnFamily(cfd->GetID());
+            for (const FileMetaData* f : level_files) {
+              edit.DeleteFile(from_level, f->fd.GetNumber());
+              edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                           f->fd.GetFileSize(), f->smallest, f->largest,
+                           f->fd.smallest_seqno, f->fd.largest_seqno,
+                           f->marked_for_compaction,
+                           f->temperature,  // this can be different from
+                                            // `last_level_temperature`
+                           f->oldest_blob_file_number, f->oldest_ancester_time,
+                           f->file_creation_time, f->file_checksum,
+                           f->file_checksum_func_name, f->min_timestamp,
+                           f->max_timestamp);
+              ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                             "[%s] Moving #%" PRIu64
+                             " from level-%d to level-%d %" PRIu64 " bytes\n",
+                             cfd->GetName().c_str(), f->fd.GetNumber(),
+                             from_level, to_level, f->fd.GetFileSize());
+            }
+          }
+          --to_level;
+        }
+      }
+    }
+  }
   s = SetDBId(read_only);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n",
+                 db_id_.c_str());
   if (s.ok() && !read_only) {
     s = DeleteUnreferencedSstFiles();
   }
@@ -1662,7 +1755,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
   impl->mutex_.Lock();
   // Handles create_if_missing, error_if_exists
   uint64_t recovered_seq(kMaxSequenceNumber);
-  s = impl->Recover(column_families, false, false, false, &recovered_seq);
+  s = impl->Recover(column_families, false /* read_only */,
+                    false /* error_if_wal_file_exists */,
+                    false /* error_if_data_exists_in_wals */, &recovered_seq);
   if (s.ok()) {
     uint64_t new_log_number = impl->versions_->NewFileNumber();
     log::Writer* new_log = nullptr;
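As a reading aid for the recovery hunk above, here is a minimal standalone sketch (not RocksDB code) of the level-filling loop: given per-level file counts, it computes which level each non-empty level would be trivially moved to. The function name `ComputeTrivialMoveTargets`, the plain `files_per_level` vector, and the `last_level_reserved` flag are illustrative assumptions rather than RocksDB APIs. For the example LSM used in the `advanced_options.h` comment below (7 levels with file counts [2, 2, 1, 0, 1, 0, 0]), it returns {1: 4, 2: 5, 4: 6}, i.e. L1->L4, L2->L5, L4->L6.

    #include <map>
    #include <vector>

    // Sketch only: mirrors the loop in DBImpl::Recover above. Returns a map
    // from source level to the level its files would be trivially moved to.
    std::map<int, int> ComputeTrivialMoveTargets(
        const std::vector<int>& files_per_level, bool last_level_reserved) {
      const int num_levels = static_cast<int>(files_per_level.size());
      int to_level = num_levels - 1;
      if (last_level_reserved) {
        to_level -= 1;  // e.g. allow_ingest_behind keeps the last level free
      }
      std::map<int, int> moves;
      for (int from_level = to_level; from_level >= 0; --from_level) {
        if (files_per_level[from_level] == 0 || from_level == 0) {
          continue;  // empty levels are skipped and L0 is never moved
        }
        if (from_level < to_level) {
          moves[from_level] = to_level;  // fill the lowest open slot
        }
        --to_level;
      }
      return moves;
    }

Filling from the bottom up matches how `level_compaction_dynamic_level_bytes` derives level target sizes upward from the last level, so the moved files land where the compaction scorer expects data to live.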
diff --git a/db/version_set.cc b/db/version_set.cc
index 8e0e7f149ea..25302bfaea8 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2710,7 +2710,7 @@ void VersionStorageInfo::ComputeCompactionScore(
       // the level's target size, and 1.0 is the threshold for triggering
       // compaction. Higher score means higher prioritization.
       // Now we keep the compaction triggering condition, but consider more
-      // factors for priorization, while still keeping the 1.0 threshold.
+      // factors for prioritization, while still keeping the 1.0 threshold.
       // In order to provide flexibility for reducing score while still
       // maintaining it to be over 1.0, we scale the original score by 10x
       // if it is larger than 1.0.
@@ -2743,7 +2743,7 @@ void VersionStorageInfo::ComputeCompactionScore(
         // compaction score for the whole DB. Adding other levels as if
         // they are L0 files.
         for (int i = 1; i < num_levels(); i++) {
-          // Its possible that a subset of the files in a level may be in a
+          // It's possible that a subset of the files in a level may be in a
           // compaction, due to delete triggered compaction or trivial move.
           // In that case, the below check may not catch a level being
           // compacted as it only checks the first file. The worst that can
@@ -2792,7 +2792,7 @@ void VersionStorageInfo::ComputeCompactionScore(
         // When calculating estimated_compaction_needed_bytes, we assume
         // L0 is qualified as pending compactions. We will need to make
         // sure that it qualifies for compaction.
-        // It might be guafanteed by logic below anyway, but we are
+        // It might be guaranteed by logic below anyway, but we are
        // explicit here to make sure we don't stop writes with no
        // compaction scheduled.
        score = std::max(score, 1.01);
diff --git a/db/version_set.h b/db/version_set.h
index 223c8042520..96492c67a1a 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1068,7 +1068,7 @@ class VersionSet {
                                   uint64_t* manifest_file_number);
   void WakeUpWaitingManifestWriters();

-  // Recover the last saved descriptor from persistent storage.
+  // Recover the last saved descriptor (MANIFEST) from persistent storage.
   // If read_only == true, Recover() will not complain if some column families
   // are not opened
   Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 8b258dc3747..913e889acbf 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -587,8 +587,31 @@ struct AdvancedColumnFamilyOptions {
   //
   // max_bytes_for_level_multiplier_additional is ignored with this flag on.
   //
-  // Turning this feature on or off for an existing DB can cause unexpected
-  // LSM tree structure so it's not recommended.
+  // To make the migration easier, when turning this feature on, files in the
+  // LSM will be trivially moved down to fill the LSM starting from the
+  // bottommost level during DB open. For example, if the LSM looks like:
+  // L0: f0, f1
+  // L1: f2, f3
+  // L2: f4
+  // L3:
+  // L4: f5
+  // and the DB is opened with num_levels = 7 and this feature turned on,
+  // the new LSM after DB open looks like the following:
+  // L0: f0, f1, (and possibly data flushed from WAL)
+  // L4: f2, f3
+  // L5: f4
+  // L6: f5
+  //
+  // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
+  // then the last level is reserved, and we will start filling the LSM from
+  // the second-to-last level (L5 in the above example).
+  //
+  // Note that there may be excess levels (levels whose target size is 0 when
+  // computed based on this feature) in the LSM after a user migrates to turn
+  // this feature on. This is especially likely when a user migrates from
+  // leveled compaction with a smaller multiplier or from universal compaction.
+  // A full manual compaction is needed to drain these levels explicitly.
+  //
   //
   // Default: false
   bool level_compaction_dynamic_level_bytes = false;
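To tie the option comment above to the recovery code in `db_impl_open.cc`, here is a minimal migration sketch against the public RocksDB API, assuming a placeholder DB path `/tmp/exampledb` and asserts in place of real error handling. The unbounded `CompactRange` at the end is one way to run the "full manual compaction" the comment recommends for draining excess levels.

    #include <cassert>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // Turning the feature on: DB::Open trivially moves existing files down
      // so the LSM is filled starting from the bottommost level.
      options.level_compaction_dynamic_level_bytes = true;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/exampledb", &db);
      assert(s.ok());

      // If the old LSM had more non-empty levels than the new shape needs
      // (e.g. after universal compaction or a smaller level multiplier),
      // a full manual compaction drains the excess levels explicitly.
      s = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
      assert(s.ok());

      delete db;
      return 0;
    }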
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index bd956108790..7b9357b432f 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -247,7 +247,7 @@ def is_direct_io_supported(dbname):
     "target_file_size_multiplier": 1,
     "test_batches_snapshots": 0,
     "write_buffer_size": 32 * 1024 * 1024,
-    "level_compaction_dynamic_level_bytes": False,
+    "level_compaction_dynamic_level_bytes": lambda: random.randint(0, 1),
     "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
 }