Merge pull request facebook#24 from petermattis/pmattis/range-delete
Forward port the various DeleteRange PRs
petermattis authored Dec 7, 2018
2 parents d793039 + 1e342df commit dc30625
Showing 22 changed files with 824 additions and 252 deletions.
208 changes: 192 additions & 16 deletions db/db_range_del_test.cc
@@ -310,8 +310,6 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_EQ(0, NumTableFilesAtLevel(0));
ASSERT_GT(NumTableFilesAtLevel(1), 0);
ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));

for (int i = 0; i < kNumFiles; ++i) {
for (int j = 0; j < kNumPerFile; ++j) {
@@ -807,23 +805,201 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorCoveredSst) {
// TODO(peter): generate multiple sstables with some being covered
// by tombstones and some that aren't.
db_->Put(WriteOptions(), "key", "val");
ASSERT_OK(db_->Flush(FlushOptions()));
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

ReadOptions read_opts;
auto* iter = db_->NewIterator(read_opts);

int count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
++count;
}
ASSERT_EQ(0, count);
delete iter;
}

TEST_F(DBRangeDelTest, IteratorRangeTombstoneOverlapsSstable) {
struct TestCase {
const Comparator* comparator;
std::vector<std::string> table_keys;
std::vector<Range> range_del;
std::vector<std::string> expected_keys;
};

std::vector<TestCase> test_cases = {
// Range tombstone covers the sstable.
{
BytewiseComparator(),
{ "a", "b", "c" },
{ { "a", "d" } },
{ },
},
// Range tombstone overlaps the start of the sstable.
{
BytewiseComparator(),
{ "a", "b", "c" },
{ { "", "b" } },
{ "b", "c" },
},
// Empty range tombstone start key and empty key.
{
BytewiseComparator(),
{ "", "b", "c" },
{ { "", "b" } },
{ "b", "c" },
},
// Range tombstone overlaps the end of the sstable.
{
BytewiseComparator(),
{ "a", "b", "c" },
{ { "c", "d" } },
{ "a", "b" },
},
// Range tombstones overlap both the start and the end of the
// sstable.
{
BytewiseComparator(),
{ "a", "b", "c" },
{ { "", "b" }, { "c", "d" } },
{ "b" },
},
// Range tombstone overlaps the middle of the sstable.
{
BytewiseComparator(),
{ "a", "b", "c" },
{ { "b", "c" } },
{ "a", "c" },
},
// Multiple range tombstones overlapping the sstable.
{
BytewiseComparator(),
{ "a", "b", "c", "d", "e" },
{ { "b", "c" }, { "d", "e" } },
{ "a", "c", "e" },
},
// Reverse comparator with empty range tombstone end key.
{
ReverseBytewiseComparator(),
{ "b", "a", "" },
{ { "a", "" }, { "b", "a" } },
{ "" },
},
};
for (auto tc : test_cases) {
Options options = CurrentOptions();
options.comparator = tc.comparator;
DestroyAndReopen(options);

for (auto key : tc.table_keys) {
db_->Put(WriteOptions(), key, "");
}

ASSERT_OK(db_->Flush(FlushOptions()));
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);

for (auto d : tc.range_del) {
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), d.start, d.limit);
}

{
unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
std::vector<std::string> keys;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
keys.emplace_back(iter->key().ToString());
}
ASSERT_EQ(tc.expected_keys, keys);
}

{
unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
std::vector<std::string> keys;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
keys.emplace_back(iter->key().ToString());
}
std::reverse(keys.begin(), keys.end());
ASSERT_EQ(tc.expected_keys, keys);
}

{
unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
for (auto key : tc.expected_keys) {
iter->Seek(key);
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(key, iter->key());
iter->SeekForPrev(key);
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(key, iter->key());
}
}
}
}

TEST_F(DBRangeDelTest, IteratorRangeTombstoneMultipleBlocks) {
Options options = CurrentOptions();
BlockBasedTableOptions table_options;
table_options.block_size = 1024;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Reopen(options);

// Create one SST whose data splits into two blocks, [a, b] and [c, d]: each
// value is 512 bytes, so two entries fill a 1024-byte block.
db_->Put(WriteOptions(), "a", std::string(512, 'x'));
db_->Put(WriteOptions(), "b", std::string(512, 'x'));
db_->Put(WriteOptions(), "c", std::string(512, 'x'));
db_->Put(WriteOptions(), "d", std::string(512, 'x'));
ASSERT_OK(db_->Flush(FlushOptions()));
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);

// This test exercises a bookkeeping issue discovered in the
// BlockBasedTableIterator. In short, a range tombstone spanning keys in
// multiple data blocks could cause the iterator to incorrectly reuse its
// cached data block when it in fact needed to fetch a new data block.
//
// The goal is to force BlockBasedTableIterator::FindKeyForward and
// BlockBasedTableIterator::FindKeyBackward to seek between data blocks, which
// occurs when Next or Prev advances to a key covered by a range tombstone.

// Add a range tombstone over [b, d). To reproduce the bug during reverse
// iteration, it is required that the tombstone's end key is not the first key
// in a block.
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "d");

// Test the forward case.
unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
// First, seek to d to put the [c, d] block in the cache.
iter->Seek("d");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());
// Then, seek to a. This puts the [a, b] block in the cache.
iter->Seek("a");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
// Advance to b. Since this key is covered by the [b, d) tombstone,
// FindKeyForward will need to seek to the next data block to find the next
// non-deleted key (d). With the bug present, the iterator would forget that
// it had put the [a, b] block in the cache, instead thinking that [c, d] was
// still in the cache. It would thus look for d in the [a, b] block and
// incorrectly declare that the key did not exist.
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());

// Test the reverse case. Beware: this is not quite analogous to the forward
// case.
iter.reset(db_->NewIterator(ReadOptions()));
// First, seek to a to put the [a, b] block in the cache. To be perfectly
// analogous, we'd use SeekForPrev, but DBIter performs a Prev after every
// SeekForPrev. So SeekForPrev(a) winds up invalidating the internal
// BlockBasedTableIterator instead of warming its cache.
iter->Seek("a");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
// Seek to d. This first puts the [c, d] block in the cache. As mentioned
// above, DBIter performs a Prev internally, so this also advances backwards
// over the [b, d) tombstone and puts [a, b] in the cache. (The result of this
// internal Prev is stashed away and not visible until the next user call to
// Prev.) As in the forward case, with the bug present, the iterator would
// forget that it had put the [a, b] block in the cache, instead thinking that
// [c, d] was still in the cache. It would thus look for a in the [c, d] block
// and incorrectly declare that the key did not exist.
//
// Note that if the tombstone were instead over [b, c), the bug would not
// occur. Advancing backwards over this tombstone would invalidate the cached
// data block's iterator, thus triggering a code path that correctly reloaded
the [c, d] block into the cache.
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());
// Observe the result of the internal Prev by advancing backwards to a.
iter->Prev();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
}

#ifndef ROCKSDB_UBSAN_RUN
32 changes: 1 addition & 31 deletions db/db_test_util.h
@@ -46,6 +46,7 @@
#include "table/scoped_arena_iterator.h"
#include "util/compression.h"
#include "util/filename.h"
#include "util/mock_time_env.h"
#include "util/mutexlock.h"

#include "util/string_util.h"
@@ -582,37 +583,6 @@ class SpecialEnv : public EnvWrapper {
std::atomic<size_t> compaction_readahead_size_{};
};

class MockTimeEnv : public EnvWrapper {
public:
explicit MockTimeEnv(Env* base) : EnvWrapper(base) {}

virtual Status GetCurrentTime(int64_t* time) override {
assert(time != nullptr);
assert(current_time_ <=
static_cast<uint64_t>(std::numeric_limits<int64_t>::max()));
*time = static_cast<int64_t>(current_time_);
return Status::OK();
}

virtual uint64_t NowMicros() override {
assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000);
return current_time_ * 1000000;
}

virtual uint64_t NowNanos() override {
assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000000);
return current_time_ * 1000000000;
}

void set_current_time(uint64_t time) {
assert(time >= current_time_);
current_time_ = time;
}

private:
std::atomic<uint64_t> current_time_{0};
};

#ifndef ROCKSDB_LITE
class OnFileDeletionListener : public EventListener {
public:
57 changes: 57 additions & 0 deletions db/dbformat.h
@@ -674,5 +674,62 @@ int InternalKeyComparator::CompareKeySeq(const Slice& akey,
return r;
}

// A PartialRangeTombstone represents a range tombstone whose start and end keys
// may be infinite. It is notably returned by RangeDelAggregator::GetTombstone,
// where a PartialRangeTombstone with a nullptr start key represents a synthetic
// tombstone before the first real tombstone and a PartialRangeTombstone with a
// nullptr end key represents a synthetic tombstone after the last real range
// tombstone. Unlike RangeTombstones, PartialRangeTombstones are never
// serialized and stored to disk. They exist only in memory.
//
// start_key and end_key are set and retrieved with type Slice*, but
// PartialRangeTombstone makes its own copy of the pointed-to Slices, if they
// are not nullptr. It is therefore safe for a PartialRangeTombstone to outlive
// the Slices it is constructed with. Note, however, that it does not make a
// copy of the actual data pointed to by the Slices.
class PartialRangeTombstone {
public:
PartialRangeTombstone()
: start_key_valid_(false), end_key_valid_(false), seq_(0) {}

PartialRangeTombstone(const ParsedInternalKey* sk,
const ParsedInternalKey* ek,
SequenceNumber sq)
: seq_(sq) {
SetStartKey(sk);
SetEndKey(ek);
}

void SetStartKey(const ParsedInternalKey* sk) {
if (sk != nullptr) {
start_key_ = *sk;
start_key_valid_ = true;
} else {
start_key_valid_ = false;
}
}

void SetEndKey(const ParsedInternalKey* ek) {
if (ek != nullptr) {
end_key_ = *ek;
end_key_valid_ = true;
} else {
end_key_valid_ = false;
}
}

const ParsedInternalKey* start_key() const {
return start_key_valid_ ? &start_key_ : nullptr;
}
const ParsedInternalKey* end_key() const { return end_key_valid_ ? &end_key_ : nullptr; }
SequenceNumber seq() const { return seq_; }

private:
ParsedInternalKey start_key_;
ParsedInternalKey end_key_;
bool start_key_valid_;
bool end_key_valid_;
SequenceNumber seq_;
};
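
// Illustrative usage sketch (editor's addition, not part of this diff): shows
// the nullptr-means-unbounded convention described in the comment above. Only
// the PartialRangeTombstone class and ParsedInternalKey are assumed; the
// function name is invented for illustration.
inline bool PartialRangeTombstoneUsageSketch() {
// A tombstone bounded below by "a" and unbounded above, as GetTombstone
// might return for the region after the last real range tombstone.
ParsedInternalKey start("a", 100 /* seq */, kTypeRangeDeletion);
PartialRangeTombstone t(&start, nullptr /* no end key: +infinity */, 100);
// start_key() is non-null (bounded below); end_key() is nullptr (unbounded).
return t.start_key() != nullptr && t.end_key() == nullptr && t.seq() == 100;
}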

} // namespace rocksdb