CompactRange() always compacts to bottommost level for leveled compaction #11468

Closed · 7 commits
6 changes: 4 additions & 2 deletions db/compaction/compaction_picker.cc
@@ -753,8 +753,10 @@ Compaction* CompactionPicker::CompactRange(
 
   // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
   // files that are created during the current compaction.
-  if (compact_range_options.bottommost_level_compaction ==
-          BottommostLevelCompaction::kForceOptimized &&
+  if ((compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kForceOptimized ||
+       compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kIfHaveCompactionFilter) &&
       max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
     assert(input_level == output_level);
     // inputs_shrunk holds a continuous subset of input files which were all
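For orientation, a minimal, hypothetical caller exercising the path this hunk extends: with a compaction filter configured, `kIfHaveCompactionFilter` now reuses the same `max_file_num_to_ignore` filtering as `kForceOptimized`, so files written by the manual compaction itself are not re-compacted. The filter class, DB path, and option wiring below are illustrative assumptions, not code from this PR:

```cpp
#include <cassert>
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Hypothetical filter for illustration: drops keys whose value is empty.
class DropEmptyValueFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& existing_value, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return existing_value.empty();  // returning true removes the key
  }
  const char* Name() const override { return "DropEmptyValueFilter"; }
};

int main() {
  DropEmptyValueFilter filter;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_filter = &filter;  // makes kIfHaveCompactionFilter apply

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);
  assert(s.ok());

  rocksdb::CompactRangeOptions cro;
  // kIfHaveCompactionFilter is the default; spelled out for clarity. Since a
  // compaction filter is configured, the bottommost level is compacted, and
  // files written by this same manual compaction are skipped via the
  // max_file_num_to_ignore path this hunk extends.
  cro.bottommost_level_compaction =
      rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
  s = db->CompactRange(cro, nullptr, nullptr);  // nullptrs = full key range
  assert(s.ok());
  delete db;
  return 0;
}
```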
2 changes: 1 addition & 1 deletion db/db_bloom_filter_test.cc
@@ -2614,7 +2614,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
   compact_options.target_level = 7;
   ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
 
-  ASSERT_EQ(trivial_move, 1);
+  ASSERT_GE(trivial_move, 1);
   ASSERT_EQ(non_trivial_move, 0);
 
   prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);

[Review comment from the PR author] There are now multiple trivial moves to compact to the bottommost level.
152 changes: 134 additions & 18 deletions db/db_compaction_test.cc
@@ -7966,26 +7966,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
   auto start_idx = key_idx;
   GenerateNewFile(&rnd, &key_idx);
   GenerateNewFile(&rnd, &key_idx);
-  auto end_idx = key_idx - 1;
   ASSERT_EQ("1,1,2", FilesPerLevel(0));
 
-  // Next two CompactRange() calls are used to test exercise error paths within
-  // RefitLevel() before triggering a valid RefitLevel() call
-
-  // Trigger a refit to L1 first
-  {
-    std::string begin_string = Key(start_idx);
-    std::string end_string = Key(end_idx);
-    Slice begin(begin_string);
-    Slice end(end_string);
-
-    CompactRangeOptions cro;
-    cro.change_level = true;
-    cro.target_level = 1;
-    ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
-  }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));
 
+  // The next CompactRange() call is used to test exercise error paths within
+  // RefitLevel() before triggering a valid RefitLevel() call
+  //
   // Try a refit from L2->L1 - this should fail and exercise error paths in
   // RefitLevel()
   {
@@ -8000,7 +7988,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
     cro.target_level = 1;
     ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
   }
-  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+  ASSERT_EQ("0,2,2", FilesPerLevel(0));
 
   // Try a valid Refit request to ensure, the path is still working
   {
@@ -9637,6 +9625,134 @@ TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterDBBecomesSmall) {
   ASSERT_GT(after_delete_range_nonempty, final_num_nonempty);
 }
 
+TEST_F(DBCompactionTest, ManualCompactionCompactAllKeysInRange) {
+  // CompactRange() used to pre-compute target level to compact to
+  // before running compactions. However, the files at target level
+  // could be trivially moved down by some background compaction. This means
+  // some keys in the manual compaction key range may not be compacted
+  // during the manual compaction. This unit test tests this scenario.
+  // A fix has been applied for this scenario to always compact
+  // to the bottommost level.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Populate L2 so that manual compaction will compact to at least L2.
+  // Otherwise, there is still a possibility of race condition where
+  // the manual compaction thread believes that max non-empty level is L1
+  // while there is some auto compaction that moves some files from L1 to L2.
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000), rnd.RandomString(100)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // one file in L1: [Key(5), Key(6)]
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(5), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(6), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 2)));
+  // We now do manual compaction for key range [Key(1), Key(6)].
+  // First it compacts file [Key(1)] to L1.
+  // L1 will have two files [Key(1)], and [Key(5), Key(6)].
+  // After L0 -> L1 manual compaction, an automatic compaction will trivially
+  // move both files from L1 to L2. Here the dependency makes manual compaction
+  // wait for auto-compaction to pick a compaction before proceeding. Manual
+  // compaction should not stop at L1 and keep compacting L2. With kForce
+  // specified, expected output is that manual compaction compacts to L2 and L2
+  // will contain 2 files: one for Key(1000) and one for Key(1), Key(5) and
+  // Key(6).
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::string begin_str = Key(1);
+  std::string end_str = Key(6);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+}
+
+TEST_F(DBCompactionTest,
+       ManualCompactionCompactAllKeysInRangeDynamicLevelBytes) {
+  // Similar to the test above (ManualCompactionCompactAllKeysInRange), but with
+  // level_compaction_dynamic_level_bytes = true.
+  const int kBaseLevelBytes = 8 << 20;  // 8MB
+  const int kMultiplier = 2;
+  Options options = CurrentOptions();
+  options.num_levels = 7;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.max_bytes_for_level_base = kBaseLevelBytes;
+  options.max_bytes_for_level_multiplier = kMultiplier;
+  options.compression = kNoCompression;
+  options.target_file_size_base = 2 * kBaseLevelBytes;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5),
+                     rnd.RandomString(3 * kBaseLevelBytes / 2)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+  // L6 now has one file with size ~ 3/2 * kBaseLevelBytes.
+  // L5 is the new base level, with target size ~ 3/4 * kBaseLevelBytes.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(3), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(4), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(5);
+  ASSERT_EQ(1, NumTableFilesAtLevel(5));
+  // L5 now has one file with size ~ 2/3 * kBaseLevelBytes, which is below its
+  // target size.
+
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 3)));
+  ASSERT_OK(
+      db_->Put(WriteOptions(), Key(2), rnd.RandomString(kBaseLevelBytes / 3)));
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "DBImpl::RunManualCompaction()::1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  // After compacting the file with [Key(1), Key(2)] to L5,
+  // L5 has size ~ 4/3 * kBaseLevelBytes > its target size.
+  // We let manual compaction wait for an auto-compaction to pick
+  // a compaction before proceeding. The auto-compaction would
+  // trivially move both files in L5 down to L6. If manual compaction
+  // works correctly with kForce specified, it should rewrite the two files in
+  // L6 into a single file.
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  std::string begin_str = Key(1);
+  std::string end_str = Key(4);
+  Slice begin_slice = begin_str;
+  Slice end_slice = end_str;
+  ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice));
+  ASSERT_EQ(2, NumTableFilesAtLevel(6));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
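Both new tests force the race deterministically with RocksDB's SyncPoint test facility. As a reading aid, here is that pattern in isolation — the header path and marker strings are the ones used inside the RocksDB tree; the surrounding function is hypothetical:

```cpp
#include "test_util/sync_point.h"  // internal RocksDB header, test builds only

// LoadDependency({{A, B}}) makes any thread that reaches sync point B wait
// until some thread has passed sync point A, forcing the interleaving the
// test needs. The two markers below are the ones the tests above use: the
// manual compaction blocks until a background compaction has been picked.
void ForceAutoCompactionToBePickedFirst() {
  using ROCKSDB_NAMESPACE::SyncPoint;
  SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BackgroundCompaction():AfterPickCompaction",
        "DBImpl::RunManualCompaction()::1"}});
  SyncPoint::GetInstance()->EnableProcessing();

  // ... run the manual compaction under test here ...

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}
```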
47 changes: 29 additions & 18 deletions db/db_impl/db_impl_compaction_flush.cc
@@ -1066,7 +1066,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                          std::numeric_limits<uint64_t>::max(), trim_ts);
   } else {
     int first_overlapped_level = kInvalidLevel;
-    int max_overlapped_level = kInvalidLevel;
     {
       SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
       Version* current_version = super_version->current;
@@ -1142,10 +1141,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
                                                      begin, end);
           }
           if (overlap) {
-            if (first_overlapped_level == kInvalidLevel) {
-              first_overlapped_level = level;
-            }
-            max_overlapped_level = level;
+            first_overlapped_level = level;
+            break;
           }
         }
       CleanupSuperVersion(super_version);
@@ -1159,7 +1156,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
           end, exclusive, true /* disallow_trivial_move */,
           std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
           trim_ts);
-      final_output_level = max_overlapped_level;
+      final_output_level = first_overlapped_level;
     } else {
       assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
       uint64_t next_file_number = versions_->current_next_file_number();
@@ -1171,7 +1168,29 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
       int level = first_overlapped_level;
       final_output_level = level;
       int output_level = 0, base_level = 0;
-      while (level < max_overlapped_level || level == 0) {
+      for (;;) {
+        // Always allow L0 -> L1 compaction
+        if (level > 0) {
+          if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
+            assert(final_output_level < cfd->ioptions()->num_levels);
+            if (final_output_level + 1 == cfd->ioptions()->num_levels) {
+              break;
+            }
+          } else {
+            // TODO(cbi): there is still a race condition here where
+            // if a background compaction compacts some file beyond
+            // current()->storage_info()->num_non_empty_levels() right after
+            // the check here. This should happen very infrequently and should
+            // not happen once a user populates the last level of the LSM.
+            InstrumentedMutexLock l(&mutex_);
+            // num_non_empty_levels may be lower after a compaction, so
+            // we check for >= here.
+            if (final_output_level + 1 >=
+                cfd->current()->storage_info()->num_non_empty_levels()) {
+              break;
+            }
+          }
+        }
         output_level = level + 1;
         if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
             level == 0) {
@@ -1203,28 +1222,20 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
         if (s.ok()) {
           assert(final_output_level > 0);
           // bottommost level intra-level compaction
-          // TODO(cbi): this preserves earlier behavior where if
-          // max_overlapped_level = 0 and bottommost_level_compaction is
-          // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
-          // and do not do intra-LBase compaction even when user configures
-          // compaction filter. We may want to still do a LBase -> LBase
-          // compaction in case there is some file in LBase that did not go
-          // through L0 -> LBase compaction, and hence did not go through
-          // compaction filter.
           if ((options.bottommost_level_compaction ==
                    BottommostLevelCompaction::kIfHaveCompactionFilter &&
-               max_overlapped_level != 0 &&
                (cfd->ioptions()->compaction_filter != nullptr ||
                 cfd->ioptions()->compaction_filter_factory != nullptr)) ||
               options.bottommost_level_compaction ==
                   BottommostLevelCompaction::kForceOptimized ||
              options.bottommost_level_compaction ==
                  BottommostLevelCompaction::kForce) {
            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
-           // rewriting newly compacted files when it is kForceOptimized.
+           // rewriting newly compacted files when it is kForceOptimized
+           // or kIfHaveCompactionFilter with compaction filter set.
            s = RunManualCompaction(
                cfd, final_output_level, final_output_level, options, begin,
-               end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
+               end, exclusive, true /* disallow_trivial_move */,
                next_file_number /* max_file_num_to_ignore */, trim_ts);
          }
        }

[Review comment from the PR author, on the disallow_trivial_move change] Intra-level compaction does not allow trivial move anyway.
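As a reading aid, the stopping rule of the new loop above, restated as a standalone sketch. Names are simplified and the locking and per-level compaction calls are elided, so this is illustrative rather than the PR's code:

```cpp
#include <functional>

// Sketch: decide how far down a leveled manual compaction should go.
// `num_non_empty_levels` stands in for
// cfd->current()->storage_info()->num_non_empty_levels(), re-read on every
// iteration (under the DB mutex in the real code) because background
// compactions can move files down concurrently.
int FinalOutputLevel(int first_overlapped_level, int num_levels,
                     bool dynamic_level_bytes,
                     const std::function<int()>& num_non_empty_levels) {
  int level = first_overlapped_level;
  for (;;) {
    if (level > 0) {  // always allow the L0 -> L1 step
      if (dynamic_level_bytes) {
        if (level + 1 == num_levels) break;  // reached the last LSM level
      } else if (level + 1 >= num_non_empty_levels()) {
        break;  // reached the last non-empty level (>= since it may shrink)
      }
    }
    ++level;  // the real code compacts `level` into `level + 1` here
  }
  return level;
}
```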
11 changes: 6 additions & 5 deletions db/db_test2.cc
@@ -2063,6 +2063,7 @@ class PinL0IndexAndFilterBlocksTest
     ASSERT_OK(Flush(1));
     // move this table to L1
     ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1, 1));
 
     // reset block cache
     table_options.block_cache = NewLRUCache(64 * 1024);
@@ -2220,7 +2221,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
   // this should be read from L1
   value = Get(1, "a");
   if (!disallow_preload_) {
-    // In inifinite max files case, there's a cache miss in executing Get()
+    // In infinite max files case, there's a cache miss in executing Get()
     // because index and filter are not prefetched before.
     ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
@@ -2248,12 +2249,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   } else {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   }
 
   // Bloom and index hit will happen when a Get() happens.

[Review comment from the PR author] The expected index hits drop by one because we no longer read more levels to compute max_overlapped_level. Same for the changes below.
@@ -2262,12 +2263,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   } else {
     ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
     ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
     ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
   }
 }
6 changes: 3 additions & 3 deletions db/manual_compaction_test.cc
@@ -286,9 +286,9 @@ TEST_F(ManualCompactionTest, SkipLevel) {
     filter->Reset();
     ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
     ASSERT_EQ(4, filter->NumKeys());
-    // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
-    // so finally the logged level for 1 is L1.
-    ASSERT_EQ(1, filter->KeyLevel("1"));
+    // 1 is first compacted from L0 to L1, and then L1 intra level compaction
+    // compacts [2, 4, 8] only.
+    ASSERT_EQ(0, filter->KeyLevel("1"));
     ASSERT_EQ(1, filter->KeyLevel("2"));
     ASSERT_EQ(1, filter->KeyLevel("4"));
     ASSERT_EQ(1, filter->KeyLevel("8"));

[Review comment from the PR author] New behavior for kIfHaveCompactionFilter.
3 changes: 2 additions & 1 deletion db/seqno_time_test.cc
@@ -226,7 +226,8 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
     }
     ASSERT_OK(Flush());
   }
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Second to last level
+  MoveFilesToLevel(5);
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
13 changes: 8 additions & 5 deletions include/rocksdb/options.h
@@ -1817,15 +1817,18 @@ struct CompactionOptions {
 // For level based compaction, we can configure if we want to skip/force
 // bottommost level compaction.
 enum class BottommostLevelCompaction {
-  // Skip bottommost level compaction
+  // Skip bottommost level compaction.
   kSkip,
-  // Only compact bottommost level if there is a compaction filter
-  // This is the default option
+  // Only compact bottommost level if there is a compaction filter.
+  // This is the default option.
+  // Similar to kForceOptimized, when compacting bottommost level, avoid
+  // double-compacting files
+  // created in the same manual compaction.
   kIfHaveCompactionFilter,
-  // Always compact bottommost level
+  // Always compact bottommost level.
   kForce,
   // Always compact bottommost level but in bottommost level avoid
-  // double-compacting files created in the same compaction
+  // double-compacting files created in the same compaction.
   kForceOptimized,
 };
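A short usage sketch for these options. The helper name and error handling are assumptions for illustration, not part of this diff:

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Choose a bottommost-level policy for a manual compaction. kForce always
// rewrites the bottommost level; kForceOptimized does too, but skips files
// that this same manual compaction just wrote.
rocksdb::Status ManualCompact(rocksdb::DB* db, bool rewrite_everything) {
  rocksdb::CompactRangeOptions cro;
  cro.bottommost_level_compaction =
      rewrite_everything ? rocksdb::BottommostLevelCompaction::kForce
                         : rocksdb::BottommostLevelCompaction::kForceOptimized;
  // nullptr begin/end means the whole key range.
  return db->CompactRange(cro, nullptr, nullptr);
}
```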
@@ -0,0 +1,2 @@
+For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
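To make the first note concrete, a hedged sketch of the new behavior for a leveled-compaction DB; the function name is hypothetical:

```cpp
#include "rocksdb/db.h"

// Sketch (not a test from this PR): where a full-range manual compaction
// lands after this change, assuming the DB already has data in deeper
// levels than the requested range currently overlaps.
void CompactRangeToLastNonEmptyLevel(rocksdb::DB* db) {
  rocksdb::CompactRangeOptions cro;  // default: kIfHaveCompactionFilter
  rocksdb::Status s = db->CompactRange(cro, nullptr, nullptr);
  // Before #11468: the target level was precomputed as the deepest level
  // overlapping the range, so a concurrent trivial move could leave keys in
  // the range uncompacted at a deeper level.
  // After #11468: the compaction proceeds level by level down to the last
  // non-empty level, so every key in the range is compacted.
  s.PermitUncheckedError();
}
```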