Skip to content

Commit

Permalink
db: add OnlyReadGuaranteedDurable to IterOptions
Browse files Browse the repository at this point in the history
This is only supported for Iterators created on the DB, and
excludes data in the memtable. This will be used for
cockroachdb/cockroach#36262
which is a prerequisite for separating the state machine
into a different DB.

Note that RocksDB supports such behavior using a ReadTier
setting equal to kPersistedTier, which this PR does not adopt
because it was considered too flexible (and has limitations
like not supporting iterators). See
https://github.com/facebook/rocksdb/blob/f6d7ec1d02de1fa84eff61b7ac5a3c663bd63cd7/include/rocksdb/options.h#L1394-L1408
https://github.com/facebook/rocksdb/blob/f6d7ec1d02de1fa84eff61b7ac5a3c663bd63cd7/include/rocksdb/options.h#L1467-L1471

Additionally, treating the exclusion of memtables as an implementation
decision, as outlined in the comment in IterOptions, allows
us more flexibility in implementation in the future.
  • Loading branch information
sumeerbhola committed Feb 9, 2022
1 parent db46dab commit 616b945
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 1 deletion.
2 changes: 2 additions & 0 deletions commit.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,8 @@ func (p *commitPipeline) prepare(b *Batch, syncWAL bool) (*memTable, error) {
if syncWAL {
count++
}
// count represents the waiting needed for publish, and optionally the
// waiting needed for the WAL sync.
b.commit.Add(count)

var syncWG *sync.WaitGroup
Expand Down
11 changes: 10 additions & 1 deletion db.go
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,13 @@ func (d *DB) newIterInternal(batch *Batch, s *Snapshot, o *IterOptions) *Iterato
if o != nil && o.RangeKeyMasking.Suffix != nil && o.KeyTypes != IterKeyTypePointsAndRanges {
panic("pebble: range key masking requires IterKeyTypePointsAndRanges")
}

if (batch != nil || s != nil) && (o != nil && o.OnlyReadGuaranteedDurable) {
// We could add support for OnlyReadGuaranteedDurable on snapshots if
// there was a need: this would require checking that the sequence number
// of the snapshot has been flushed, by comparing with
// DB.mem.queue[0].logSeqNum.
panic("OnlyReadGuaranteedDurable is not supported for batches or snapshots")
}
// Grab and reference the current readState. This prevents the underlying
// files in the associated version from being deleted if there is a current
// compaction. The readState is unref'd by Iterator.Close().
Expand Down Expand Up @@ -931,6 +937,9 @@ func finishInitializingIter(buf *iterAlloc) *Iterator {
batch := dbi.batch
seqNum := dbi.seqNum
memtables := readState.memtables
if dbi.opts.OnlyReadGuaranteedDurable {
memtables = nil
}
current := readState.current

// Merging levels and levels from iterAlloc.
Expand Down
41 changes: 41 additions & 0 deletions iterator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1413,6 +1413,47 @@ func TestIteratorRandomizedBlockIntervalFilter(t *testing.T) {
require.Equal(t, 0, len(matchingKeyValues))
}

// TestIteratorGuaranteedDurable exercises IterOptions.OnlyReadGuaranteedDurable.
// It checks that creating an iterator with the option set panics on a snapshot
// and on an indexed batch, and that an iterator created on the DB with the
// option set does not observe a written key until after the DB is flushed.
func TestIteratorGuaranteedDurable(t *testing.T) {
	mem := vfs.NewMem()
	opts := &Options{FS: mem}
	d, err := Open("", opts)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, d.Close())
	}()
	iterOptions := IterOptions{OnlyReadGuaranteedDurable: true}

	// failFunc asserts that reader.NewIter(&iterOptions) panics, and closes
	// the reader regardless of the outcome.
	failFunc := func(t *testing.T, reader Reader) {
		defer func() {
			// recover must be called directly from the deferred function.
			r := recover()
			// Close the reader before asserting on the panic: a failed
			// require assertion stops this function, and the reader should
			// not be left open in that case.
			require.NoError(t, reader.Close())
			if r == nil {
				require.Fail(t, "expected panic")
			}
		}()
		iter := reader.NewIter(&iterOptions)
		// Only reached if NewIter unexpectedly did not panic; still release
		// the iterator and surface any error from doing so.
		require.NoError(t, iter.Close())
	}
	t.Run("snapshot", func(t *testing.T) {
		failFunc(t, d.NewSnapshot())
	})
	t.Run("batch", func(t *testing.T) {
		failFunc(t, d.NewIndexedBatch())
	})
	t.Run("db", func(t *testing.T) {
		require.NoError(t, d.Set([]byte("k"), []byte("v"), nil))
		// foundKV reports whether an iterator created with options o is
		// positioned on a valid entry after seeking to key "k".
		foundKV := func(o *IterOptions) bool {
			iter := d.NewIter(o)
			iter.SeekGE([]byte("k"))
			found := iter.Valid()
			require.NoError(t, iter.Close())
			return found
		}
		// Before the flush the key is visible only to a default iterator.
		require.True(t, foundKV(nil))
		require.False(t, foundKV(&iterOptions))
		require.NoError(t, d.Flush())
		// After the flush both iterators see the key.
		require.True(t, foundKV(nil))
		require.True(t, foundKV(&iterOptions))
	})
}

func BenchmarkIteratorSeekGE(b *testing.B) {
m, keys := buildMemTable(b)
iter := &Iterator{
Expand Down
28 changes: 28 additions & 0 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,34 @@ type IterOptions struct {
// and point key iteration mode (IterKeyTypePointsAndRanges).
RangeKeyMasking RangeKeyMasking

// OnlyReadGuaranteedDurable is an advanced option that is only supported by
// the Reader implemented by DB. When set to true, only state that is
// guaranteed to be durable is visible in the iterator.
// - This definition is made under the assumption that the FS implementation
// is providing a durability guarantee when data is synced.
// - The visible state represents a consistent point in the history of the
// DB.
// - The implementation is free to choose a conservative definition of what
// is guaranteed durable. For simplicity, the current implementation
// ignores memtables. A more sophisticated implementation could track the
// highest seqnum that is synced to the WAL and published and use that as
// the visible seqnum for an iterator. Note that the latter approach is
// not strictly better than the former since we can have DBs that (a) are
// synced more rarely than memtables are flushed, or (b) have no WAL. (a) is
// likely to be true in a future CockroachDB context where the DB
// containing the state machine may be rarely synced.
// NB: this current implementation relies on the fact that memtables are
// flushed in seqnum order, and any ingested sstables that happen to have a
// lower seqnum than a non-flushed memtable don't have any overlapping keys.
// This is the fundamental level invariant used in other code too, like when
// merging iterators.
//
// Semantically, using this option provides the caller a "snapshot" as of
// the time the most recent memtable was flushed. An alternate interface
// would be to add a NewSnapshot variant. Creating a snapshot is heavier
// weight than creating an iterator, so we have opted to support this
// iterator option.
OnlyReadGuaranteedDurable bool
// Internal options.
logger Logger
}
Expand Down

0 comments on commit 616b945

Please sign in to comment.