Relax VersionStorageInfo::GetOverlappingInputs check
Do not consider the range tombstone sentinel key as causing 2 adjacent
sstables in a level to overlap. When a range tombstone's end key is the
largest key in an sstable, the sstable's end key is set to a "sentinel"
value that is the smallest key in the next sstable with a sequence
number of kMaxSequenceNumber. This "sentinel" is guaranteed to not
overlap in internal-key space with the next sstable. Unfortunately,
GetOverlappingInputs uses user-keys to determine overlap and was thus
considering 2 adjacent sstables in a level to overlap if they were
separated by this sentinel key. This in turn would cause compactions to
be larger than necessary.
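To make the pitfall concrete, here is a minimal, self-contained C++ sketch
(not the RocksDB implementation; the struct and function names are
illustrative) of how a user-key overlap check is fooled by a sentinel whose
internal key does not actually overlap the next file:

#include <cstdint>
#include <iostream>
#include <string>

constexpr uint64_t kMaxSequenceNumber = (1ull << 56) - 1;

struct InternalKey {
  std::string user_key;
  uint64_t seq;  // within one user key, higher sequence numbers sort first
};

struct FileMetaData {
  InternalKey smallest, largest;
};

// Schematically what the old check did: compare user keys only.
bool UserKeyOverlap(const FileMetaData& a, const FileMetaData& b) {
  return a.largest.user_key >= b.smallest.user_key &&
         b.largest.user_key >= a.smallest.user_key;
}

int main() {
  // a.sst ends in a sentinel at user key "c", the smallest key of b.sst.
  FileMetaData a{{"a", 5}, {"c", kMaxSequenceNumber}};
  FileMetaData b{{"c", 6}, {"e", 4}};
  // In internal-key space {"c", kMaxSequenceNumber} sorts strictly before
  // {"c", 6}, so the files do not truly overlap, yet a user-key comparison
  // reports that they do:
  std::cout << UserKeyOverlap(a, b) << "\n";  // prints 1
}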

This previous behavior of GetOverlappingInputs was necessary due to the
following scenario:

  * Write a delete range [a, d).
  * After compaction, the tombstone for this range may be split across
    multiple sst files: a.sst, b.sst, c.sst, whose boundaries are [a, b),
    [b, c), and [c, d) respectively.
  * When a.sst and b.sst reach the bottommost level, the range tombstone in
    those files will be dropped.
  * Write a new key in the range [a, c).
  * When the newly written key in [a, c) reaches the bottommost level, its
    sequence number will be set to zero.
  * When c.sst (which still contains the untruncated tombstone covering
    [a, d)) later compacts with that zero-sequence key, the key appears
    older than the tombstone and is incorrectly dropped.

That scenario no longer occurs because we are truncating range deletion
tombstones to sstable boundaries when adding them to RangeDelAggregator,
as sketched below.
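A hedged sketch of the truncation idea (the function name and types here are
illustrative, not RangeDelAggregator's actual API): each tombstone is clamped
to the bounds of the sstable it was read from, so the portion of [a, d)
carried by c.sst can no longer delete keys outside [c, d).

#include <algorithm>
#include <cstdint>
#include <string>

struct RangeTombstone {
  std::string start_key, end_key;  // deletes user keys in [start_key, end_key)
  uint64_t seq;
};

// Clamp a tombstone to the key span of the file it came from. Illustrative
// only: the real code operates on internal keys and file boundaries.
RangeTombstone TruncateToFileBounds(RangeTombstone t,
                                    const std::string& file_begin,
                                    const std::string& file_end) {
  t.start_key = std::max(t.start_key, file_begin);
  t.end_key = std::min(t.end_key, file_end);
  return t;
}
// E.g. truncating {"a", "d", 42} to c.sst's bounds ["c", "d") yields
// {"c", "d", 42}, which cannot drop a newer key in [a, c).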
petermattis committed Jul 13, 2018
1 parent ce6b81d commit d6b0247
Showing 6 changed files with 298 additions and 73 deletions.
104 changes: 98 additions & 6 deletions db/db_range_del_test.cc
@@ -935,11 +935,14 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) {
 }
 
 TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
-  // make sure compaction treats files containing a split range deletion in the
-  // input level as an atomic unit. I.e., compacting any input-level file(s)
-  // containing a portion of the range deletion causes all other input-level
-  // files containing portions of that same range deletion to be included in the
-  // compaction.
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion causes all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries which removed the need
+  // for that behavior (which could lead to excessively large
+  // compactions).
   const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
   Options options = CurrentOptions();
   options.compression = kNoCompression;
@@ -986,22 +989,111 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
     if (i == 0) {
       ASSERT_OK(db_->CompactFiles(
           CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(1));
     } else if (i == 1) {
       auto begin_str = Key(0), end_str = Key(1);
       Slice begin = begin_str, end = end_str;
       ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+      ASSERT_EQ(3, NumTableFilesAtLevel(1));
     } else if (i == 2) {
       ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                 {{"max_bytes_for_level_base", "10000"}}));
       dbfull()->TEST_WaitForCompact();
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
     }
-    ASSERT_EQ(0, NumTableFilesAtLevel(1));
     ASSERT_GT(NumTableFilesAtLevel(2), 0);
 
     db_->ReleaseSnapshot(snapshot);
   }
 }

+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+  // Test the handling of the range-tombstone end-key as the
+  // upper-bound for an sstable.
+
+  const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+
+  // Create an initial sstable at L2:
+  //   [key000000#1,1, key000000#1,1]
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // A snapshot protects the range tombstone from dropping due to
+  // becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                   Key(0), Key(2 * kNumFilesPerLevel));
+
+  // Create 2 additional sstables in L0. Note that the first sstable
+  // contains the range tombstone.
+  //   [key000000#3,1, key000004#72057594037927935,15]
+  //   [key000001#5,1, key000002#6,1]
+  Random rnd(301);
+  std::string value = RandomString(&rnd, kValueBytes);
+  for (int j = 0; j < kNumFilesPerLevel; ++j) {
+    // Give files overlapping key-ranges to prevent a trivial move when we
+    // compact from L0 to L1.
+    ASSERT_OK(Put(Key(j), value));
+    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+  }
+  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+  // are 2 sstables generated in L1 due to the target_file_size_base setting.
+  //   L1:
+  //     [key000000#3,1, key000002#72057594037927935,15]
+  //     [key000002#6,1, key000004#72057594037927935,15]
+  //   L2:
+  //     [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    //   L1:
+    //     [key000000#3,1, key000002#72057594037927935,15]
+    //   L2:
+    //     [key000000#1,1, key000000#1,1]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(3);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, nullptr);
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    //   L2:
+    //     [key000000#1,1, key000001#72057594037927935,15]
+    //     [key000001#5,1, key000002#72057594037927935,15]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, &begin);
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBRangeDelTest, UnorderedTombstones) {
   // Regression test for #2752. Range delete tombstones between
   // different snapshot stripes are not stored in order, so the first
16 changes: 13 additions & 3 deletions db/dbformat.h
@@ -135,10 +135,14 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
   return Slice(internal_key.data(), internal_key.size() - 8);
 }
 
-inline ValueType ExtractValueType(const Slice& internal_key) {
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
-  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return DecodeFixed64(internal_key.data() + n - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
   unsigned char c = num & 0xff;
   return static_cast<ValueType>(c);
 }
@@ -601,9 +605,15 @@ struct RangeTombstone {
     return InternalKey(start_key_, seq_, kTypeRangeDeletion);
   }
 
+  // The tombstone end-key is exclusive, so we generate an internal-key here
+  // which has a similar property. Using kMaxSequenceNumber guarantees that
+  // the returned internal-key will compare less than any other internal-key
+  // with the same user-key. This in turn guarantees that the serialized
+  // end-key for a tombstone such as [a-b] will compare less than the key "b".
+  //
   // be careful to use SerializeEndKey(), allocates new memory
   InternalKey SerializeEndKey() const {
-    return InternalKey(end_key_, seq_, kTypeRangeDeletion);
+    return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
   }
 };
 
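For intuition, a small sketch of the ordering argument behind the new
SerializeEndKey() (illustrative, not the library's comparator code): internal
keys compare by user key ascending and then by the 8-byte footer
(sequence << 8 | type) descending, so the largest possible sequence number
makes an end key sort before every real entry with the same user key.

#include <cstdint>
#include <iostream>
#include <string>

constexpr uint64_t kMaxSequenceNumber = (1ull << 56) - 1;
enum ValueType : uint8_t { kTypeValue = 0x1, kTypeRangeDeletion = 0xF };

// Pack sequence number and type the way the internal-key footer does.
uint64_t Footer(uint64_t seq, ValueType t) { return (seq << 8) | t; }

// Mimics InternalKeyComparator: user key ascending, then footer descending
// (newer entries sort first). Returns <0, 0, or >0.
int Compare(const std::string& uk_a, uint64_t seq_a, ValueType ta,
            const std::string& uk_b, uint64_t seq_b, ValueType tb) {
  if (int c = uk_a.compare(uk_b)) return c;
  uint64_t fa = Footer(seq_a, ta), fb = Footer(seq_b, tb);
  return fa > fb ? -1 : (fa < fb ? 1 : 0);
}

int main() {
  // The serialized end key of tombstone ["a", "b")#2 versus the entry "b"#3:
  // with kMaxSequenceNumber the end key compares less, mirroring the
  // RangeTombstoneSerializeEndKey test below.
  std::cout << Compare("b", kMaxSequenceNumber, kTypeRangeDeletion,
                       "b", 3, kTypeValue)
            << "\n";  // prints -1
}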
7 changes: 7 additions & 0 deletions db/dbformat_test.cc
@@ -192,6 +192,13 @@ TEST_F(FormatTest, UpdateInternalKey) {
   ASSERT_EQ(new_val_type, decoded.type);
 }
 
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+  RangeTombstone t("a", "b", 2);
+  InternalKey k("b", 3, kTypeValue);
+  const InternalKeyComparator cmp(BytewiseComparator());
+  ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
 } // namespace rocksdb
 
 int main(int argc, char** argv) {