From 1e6a2ed470516b431c2c19a92d6159a79dc2421c Mon Sep 17 00:00:00 2001 From: Tiger Date: Thu, 21 May 2020 12:12:25 +0530 Subject: [PATCH 1/6] use cache for storing block offsets Signed-off-by: Tiger --- table/builder_test.go | 4 +- table/iterator.go | 22 ++++----- table/table.go | 102 +++++++++++++++++++++++++++++++++--------- table/table_test.go | 19 ++++++++ 4 files changed, 112 insertions(+), 35 deletions(-) diff --git a/table/builder_test.go b/table/builder_test.go index 5158a159a..dfa7f63fb 100644 --- a/table/builder_test.go +++ b/table/builder_test.go @@ -118,8 +118,8 @@ func TestTableIndex(t *testing.T) { } // Ensure index is built correctly - require.Equal(t, blockCount, len(tbl.blockIndex)) - for i, ko := range tbl.blockIndex { + require.Equal(t, blockCount, tbl.noOfBlocks) + for i, ko := range tbl.readTableIndex().Offsets { require.Equal(t, ko.Key, blockFirstKeys[i]) } f.Close() diff --git a/table/iterator.go b/table/iterator.go index 574c3bc55..d860cea03 100644 --- a/table/iterator.go +++ b/table/iterator.go @@ -197,13 +197,13 @@ func (itr *Iterator) Valid() bool { } func (itr *Iterator) seekToFirst() { - numBlocks := len(itr.t.blockIndex) + numBlocks := itr.t.noOfBlocks if numBlocks == 0 { itr.err = io.EOF return } itr.bpos = 0 - block, err := itr.t.block(itr.bpos) + block, err := itr.t.block(itr.bpos, nil) if err != nil { itr.err = err return @@ -214,13 +214,13 @@ func (itr *Iterator) seekToFirst() { } func (itr *Iterator) seekToLast() { - numBlocks := len(itr.t.blockIndex) + numBlocks := itr.t.noOfBlocks if numBlocks == 0 { itr.err = io.EOF return } itr.bpos = numBlocks - 1 - block, err := itr.t.block(itr.bpos) + block, err := itr.t.block(itr.bpos, nil) if err != nil { itr.err = err return @@ -232,7 +232,7 @@ func (itr *Iterator) seekToLast() { func (itr *Iterator) seekHelper(blockIdx int, key []byte) { itr.bpos = blockIdx - block, err := itr.t.block(blockIdx) + block, err := itr.t.block(blockIdx, nil) if err != nil { itr.err = err return @@ -251,8 +251,8 @@ func (itr *Iterator) seekFrom(key []byte, whence int) { case current: } - idx := sort.Search(len(itr.t.blockIndex), func(idx int) bool { - ko := itr.t.blockIndex[idx] + idx := sort.Search(itr.t.noOfBlocks, func(idx int) bool { + ko := itr.t.blockIndex(idx) return y.CompareKeys(ko.Key, key) > 0 }) if idx == 0 { @@ -271,7 +271,7 @@ func (itr *Iterator) seekFrom(key []byte, whence int) { itr.seekHelper(idx-1, key) if itr.err == io.EOF { // Case 1. Need to visit block[idx]. - if idx == len(itr.t.blockIndex) { + if idx == itr.t.noOfBlocks { // If idx == len(itr.t.blockIndex), then input key is greater than ANY element of table. // There's nothing we can do. Valid() should return false as we seek to end of table. return @@ -299,13 +299,13 @@ func (itr *Iterator) seekForPrev(key []byte) { func (itr *Iterator) next() { itr.err = nil - if itr.bpos >= len(itr.t.blockIndex) { + if itr.bpos >= itr.t.noOfBlocks { itr.err = io.EOF return } if len(itr.bi.data) == 0 { - block, err := itr.t.block(itr.bpos) + block, err := itr.t.block(itr.bpos, nil) if err != nil { itr.err = err return @@ -333,7 +333,7 @@ func (itr *Iterator) prev() { } if len(itr.bi.data) == 0 { - block, err := itr.t.block(itr.bpos) + block, err := itr.t.block(itr.bpos, nil) if err != nil { itr.err = err return diff --git a/table/table.go b/table/table.go index 43406fa8c..47550a14a 100644 --- a/table/table.go +++ b/table/table.go @@ -98,9 +98,8 @@ type Table struct { tableSize int // Initialized in OpenTable, using fd.Stat(). 
bfLock sync.Mutex - blockIndex []*pb.BlockOffset - ref int32 // For file garbage collection. Atomic. - bf *z.Bloom // Nil if BfCache is set. + ref int32 // For file garbage collection. Atomic. + bf *z.Bloom // Nil if BfCache is set. mmap []byte // Memory mapped. @@ -116,6 +115,8 @@ type Table struct { IsInmemory bool // Set to true if the table is on level 0 and opened in memory. opt *Options + + noOfBlocks int // Total number of blocks. } // CompressionType returns the compression algorithm used for block compression. @@ -159,7 +160,7 @@ func (t *Table) DecrRef() error { return err } // Delete all blocks from the cache. - for i := range t.blockIndex { + for i := 0; i < t.noOfBlocks; i++ { t.opt.Cache.Del(t.blockCacheKey(i)) } // Delete bloom filter from the cache. @@ -311,11 +312,13 @@ func OpenInMemoryTable(data []byte, id uint64, opt *Options) (*Table, error) { } func (t *Table) initBiggestAndSmallest() error { - if err := t.readIndex(); err != nil { + var err error + var ko *pb.BlockOffset + if ko, err = t.readIndex(); err != nil { return errors.Wrapf(err, "failed to read index.") } - t.smallest = t.blockIndex[0].Key + t.smallest = ko.Key it2 := t.NewIterator(true) defer it2.Close() @@ -362,7 +365,9 @@ func (t *Table) readNoFail(off, sz int) []byte { return res } -func (t *Table) readIndex() error { +// readIndex reads the index and populate the necessary table fields and returns +// first block offset +func (t *Table) readIndex() (*pb.BlockOffset, error) { readPos := t.tableSize // Read checksum len from the last 4 bytes. @@ -370,7 +375,7 @@ func (t *Table) readIndex() error { buf := t.readNoFail(readPos, 4) checksumLen := int(y.BytesToU32(buf)) if checksumLen < 0 { - return errors.New("checksum length less than zero. Data corrupted") + return nil, errors.New("checksum length less than zero. Data corrupted") } // Read checksum. @@ -378,7 +383,7 @@ func (t *Table) readIndex() error { readPos -= checksumLen buf = t.readNoFail(readPos, checksumLen) if err := proto.Unmarshal(buf, expectedChk); err != nil { - return err + return nil, err } // Read index size from the footer. @@ -392,7 +397,7 @@ func (t *Table) readIndex() error { data := t.readNoFail(readPos, t.indexLen) if err := y.VerifyChecksum(data, expectedChk); err != nil { - return y.Wrapf(err, "failed to verify checksum for table: %s", t.Filename()) + return nil, y.Wrapf(err, "failed to verify checksum for table: %s", t.Filename()) } index := pb.TableIndex{} @@ -400,7 +405,7 @@ func (t *Table) readIndex() error { if t.shouldDecrypt() { var err error if data, err = t.decrypt(data); err != nil { - return y.Wrapf(err, + return nil, y.Wrapf(err, "Error while decrypting table index for the table %d in Table.readIndex", t.id) } } @@ -408,7 +413,7 @@ func (t *Table) readIndex() error { y.Check(err) t.estimatedSize = index.EstimatedSize - t.blockIndex = index.Offsets + t.noOfBlocks = len(index.Offsets) if t.opt.LoadBloomsOnOpen { t.bfLock.Lock() @@ -416,12 +421,50 @@ func (t *Table) readIndex() error { t.bfLock.Unlock() } - return nil + return index.Offsets[0], nil +} + +// blockIndex returns block index for the given block id. +func (t *Table) blockIndex(idx int) *pb.BlockOffset { + if t.opt.Cache != nil { + // Try to get the block index from cache. + key := t.blockIndexCacheKey(idx) + ko, ok := t.opt.Cache.Get(key) + if ok && ko != nil { + return ko.(*pb.BlockOffset) + } + } + // Read the block index from sst. + tblIndex := t.readTableIndex() + ko := tblIndex.Offsets[idx] + + if t.opt.Cache != nil { + // Set the block index in the cache. 
+ key := t.blockIndexCacheKey(idx) + t.opt.Cache.Set(key, ko, calculateOffsetSize(ko)) + } + return ko +} + +func calculateOffsetSize(ko *pb.BlockOffset) int64 { + // 1 word = 8 bytes + var size int64 = 3*8 + // key array take 3 words + 1*8 + // offset and len takes 1 word + 3*8 + // XXX_unrecognized array takes 3 word. + 1*8 // so far 7 words, in order to round the slab we're adding one more word. + + // add key size. + size += int64(cap(ko.Key)) + // add XXX_unrecognized size. + size += int64(cap(ko.XXX_unrecognized)) + return size } -func (t *Table) block(idx int) (*block, error) { +// block returns block from the sst. If the key offset is nil, this fuction will +// obtain the block offset either from cache or sst. +func (t *Table) block(idx int, ko *pb.BlockOffset) (*block, error) { y.AssertTruef(idx >= 0, "idx=%d", idx) - if idx >= len(t.blockIndex) { + if idx >= t.noOfBlocks { return nil, errors.New("block out of index") } if t.opt.Cache != nil { @@ -431,7 +474,11 @@ func (t *Table) block(idx int) (*block, error) { return blk.(*block), nil } } - ko := t.blockIndex[idx] + + // Read the block index if it's nil + if ko == nil { + ko = t.blockIndex(idx) + } blk := &block{ offset: int(ko.Offset), } @@ -517,6 +564,12 @@ func (t *Table) blockCacheKey(idx int) []byte { return buf } +func (t *Table) blockIndexCacheKey(idx int) []byte { + y.AssertTrue(t.id < math.MaxUint32) + y.AssertTrue(uint32(idx) < math.MaxUint32) + return []byte(fmt.Sprintf("t-%d-b-%d", t.id, idx)) +} + // EstimatedSize returns the total size of key-values stored in this table (including the // disk space occupied on the value log). func (t *Table) EstimatedSize() uint64 { return t.estimatedSize } @@ -568,6 +621,14 @@ func (t *Table) DoesNotHave(hash uint64) bool { // along with the bloom filter. func (t *Table) readBloomFilter() (*z.Bloom, int) { // Read bloom filter from the SST. + index := t.readTableIndex() + bf, err := z.JSONUnmarshal(index.BloomFilter) + y.Check(err) + return bf, len(index.BloomFilter) +} + +// readTableIndex reads table index from the sst and returns its pb format. +func (t *Table) readTableIndex() *pb.TableIndex { data := t.readNoFail(t.indexStart, t.indexLen) index := pb.TableIndex{} var err error @@ -577,17 +638,14 @@ func (t *Table) readBloomFilter() (*z.Bloom, int) { y.Check(err) } y.Check(proto.Unmarshal(data, &index)) - - bf, err := z.JSONUnmarshal(index.BloomFilter) - y.Check(err) - return bf, len(index.BloomFilter) + return &index } // VerifyChecksum verifies checksum for all blocks of table. This function is called by // OpenTable() function. This function is also called inside levelsController.VerifyChecksum(). 
func (t *Table) VerifyChecksum() error { - for i, os := range t.blockIndex { - b, err := t.block(i) + for i, os := range t.readTableIndex().Offsets { + b, err := t.block(i, os) if err != nil { return y.Wrapf(err, "checksum validation failed for table: %s, block: %d, offset:%d", t.Filename(), i, os.Offset) diff --git a/table/table_test.go b/table/table_test.go index c85891dc1..17659afb8 100644 --- a/table/table_test.go +++ b/table/table_test.go @@ -31,6 +31,7 @@ import ( "github.com/cespare/xxhash" "github.com/dgraph-io/badger/v2/options" + "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto" "github.com/stretchr/testify/require" @@ -952,3 +953,21 @@ func TestDoesNotHaveRace(t *testing.T) { } wg.Wait() } + +var ko *pb.BlockOffset + +// Use this benchmark to manually verify block offset size calculation +func BenchmarkBlockOffsetSizeCalculation(b *testing.B) { + for i := 0; i < b.N; i++ { + ko = &pb.BlockOffset{ + Key: []byte{1, 23}, + } + } +} + +func TestBlockOffsetSizeCalculation(t *testing.T) { + // Empty struct testing. + require.Equal(t, calculateOffsetSize(&pb.BlockOffset{}), int64(64)) + // Testing with key bytes + require.Equal(t, calculateOffsetSize(&pb.BlockOffset{Key: []byte{1, 1}}), int64(66)) +} From e7988acf2707c516194aea5194dcfce68722e266 Mon Sep 17 00:00:00 2001 From: Tiger Date: Fri, 22 May 2020 17:49:16 +0530 Subject: [PATCH 2/6] minor Signed-off-by: Tiger --- options.go | 34 ++++++++++++---- table/iterator.go | 12 +++--- table/table.go | 96 +++++++++++++++++++++++++++------------------ table/table_test.go | 5 ++- 4 files changed, 92 insertions(+), 55 deletions(-) diff --git a/options.go b/options.go index e97ba4770..7c5844a0f 100644 --- a/options.go +++ b/options.go @@ -102,6 +102,9 @@ type Options struct { // ------------------------------ maxBatchCount int64 // max entries in batch maxBatchSize int64 // max batch size in bytes + + KeepBlockOffsetsInCache bool // if KeepBlockOffsetsInCache is true and cache is enabled then, + // block offsets are kept in cache. } // DefaultOptions sets a list of recommended options for good performance. @@ -157,19 +160,21 @@ func DefaultOptions(path string) Options { LogRotatesToFlush: 2, EncryptionKey: []byte{}, EncryptionKeyRotationDuration: 10 * 24 * time.Hour, // Default 10 days. + KeepBlockOffsetsInCache: false, } } func buildTableOptions(opt Options) table.Options { return table.Options{ - TableSize: uint64(opt.MaxTableSize), - BlockSize: opt.BlockSize, - BloomFalsePositive: opt.BloomFalsePositive, - LoadBloomsOnOpen: opt.LoadBloomsOnOpen, - LoadingMode: opt.TableLoadingMode, - ChkMode: opt.ChecksumVerificationMode, - Compression: opt.Compression, - ZSTDCompressionLevel: opt.ZSTDCompressionLevel, + TableSize: uint64(opt.MaxTableSize), + BlockSize: opt.BlockSize, + BloomFalsePositive: opt.BloomFalsePositive, + LoadBloomsOnOpen: opt.LoadBloomsOnOpen, + LoadingMode: opt.TableLoadingMode, + ChkMode: opt.ChecksumVerificationMode, + Compression: opt.Compression, + ZSTDCompressionLevel: opt.ZSTDCompressionLevel, + KeepBlockOffsetsInCache: opt.KeepBlockOffsetsInCache, } } @@ -631,3 +636,16 @@ func (opt Options) WithLoadBloomsOnOpen(b bool) Options { opt.LoadBloomsOnOpen = b return opt } + +// WithKeepBlockOffsetsInCache returns a new Option value with KeepBlockOffsetsInCache set to the +// given value. +// +// Badger uses block offsets to to find the blocks in the sst table. Blocks offset also contains +// starting key of block. 
Badger uses those keys to find the block to do an effective key iteration. +// This option is used to reduce inmemory usage of BlockOffsets. +// +// The default value of KeepBlockOffsetInCache is false. +func (opt Options) WithKeepBlockOffsetsInCache(val bool) Options { + opt.KeepBlockOffsetsInCache = val + return opt +} diff --git a/table/iterator.go b/table/iterator.go index d860cea03..43709b370 100644 --- a/table/iterator.go +++ b/table/iterator.go @@ -203,7 +203,7 @@ func (itr *Iterator) seekToFirst() { return } itr.bpos = 0 - block, err := itr.t.block(itr.bpos, nil) + block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return @@ -220,7 +220,7 @@ func (itr *Iterator) seekToLast() { return } itr.bpos = numBlocks - 1 - block, err := itr.t.block(itr.bpos, nil) + block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return @@ -232,7 +232,7 @@ func (itr *Iterator) seekToLast() { func (itr *Iterator) seekHelper(blockIdx int, key []byte) { itr.bpos = blockIdx - block, err := itr.t.block(blockIdx, nil) + block, err := itr.t.block(blockIdx) if err != nil { itr.err = err return @@ -252,7 +252,7 @@ func (itr *Iterator) seekFrom(key []byte, whence int) { } idx := sort.Search(itr.t.noOfBlocks, func(idx int) bool { - ko := itr.t.blockIndex(idx) + ko := itr.t.blockOffsets()[idx] return y.CompareKeys(ko.Key, key) > 0 }) if idx == 0 { @@ -305,7 +305,7 @@ func (itr *Iterator) next() { } if len(itr.bi.data) == 0 { - block, err := itr.t.block(itr.bpos, nil) + block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return @@ -333,7 +333,7 @@ func (itr *Iterator) prev() { } if len(itr.bi.data) == 0 { - block, err := itr.t.block(itr.bpos, nil) + block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return diff --git a/table/table.go b/table/table.go index 47550a14a..94c4eacd8 100644 --- a/table/table.go +++ b/table/table.go @@ -45,6 +45,13 @@ import ( const fileSuffix = ".sst" const intSize = int(unsafe.Sizeof(int(0))) +// 1 word = 8 bytes +// sizeOfOffsetStruct is the size of pb.BlockOffset +const sizeOfOffsetStruct int64 = 3*8 + // key array take 3 words + 1*8 + // offset and len takes 1 word + 3*8 + // XXX_unrecognized array takes 3 word. + 1*8 // so far 7 words, in order to round the slab we're adding one more word. + // Options contains configurable options for Table/Builder. type Options struct { // Options for Opening/Building Table. @@ -81,6 +88,10 @@ type Options struct { // When LoadBloomsOnOpen is set, bloom filters will be read only when they are accessed. // Otherwise they will be loaded on table open. LoadBloomsOnOpen bool + + // KeepBlockOffsetsInCache is set to true and cache is enabled. Then block offsets are + // kept in cache. + KeepBlockOffsetsInCache bool } // TableInterface is useful for testing. @@ -98,8 +109,9 @@ type Table struct { tableSize int // Initialized in OpenTable, using fd.Stat(). bfLock sync.Mutex - ref int32 // For file garbage collection. Atomic. - bf *z.Bloom // Nil if BfCache is set. + blockIndex []*pb.BlockOffset + ref int32 // For file garbage collection. Atomic. + bf *z.Bloom // Nil if BfCache is set. mmap []byte // Memory mapped. 
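Patch 2 stops caching offsets one at a time and instead stores the whole []*pb.BlockOffset slice under a single per-table key. The cost handed to Set is the estimate from calculateOffsetsSize: 8 machine words (64 bytes) per offset, plus that offset's key bytes, plus 3 words for the slice header; a slice holding one empty offset is therefore charged 88 bytes, and 90 if the offset carries a two-byte key, which are the values asserted by the updated TestBlockOffsetSizeCalculation below. A minimal, self-contained sketch of that cost-based Set/Get pattern against ristretto follows; the cache size, key, and value type here are illustrative stand-ins, not badger's actual wiring.

package main

import (
    "fmt"
    "time"

    "github.com/dgraph-io/ristretto"
)

func main() {
    // A small cost-bounded cache, similar to the block cache badger builds from MaxCacheSize.
    cache, err := ristretto.NewCache(&ristretto.Config{
        NumCounters: 1e6,       // number of keys to track access frequency for
        MaxCost:     100 << 20, // 100 MB budget, matching the fallback this series uses for MaxCacheSize
        BufferItems: 64,
    })
    if err != nil {
        panic(err)
    }

    // Stand-in for a table's []*pb.BlockOffset: just the blocks' first keys.
    offsets := [][]byte{[]byte("first-key-of-block-0"), []byte("first-key-of-block-1")}

    // Charge the entry roughly what it occupies in memory, as calculateOffsetsSize does.
    var cost int64 = 3 * 8 // slice header
    for _, k := range offsets {
        cost += 64 + int64(cap(k)) // per-offset struct estimate plus key bytes
    }

    key := []byte("bo-table-42")      // illustrative; the real key is "bo" + big-endian table id
    cache.Set(key, offsets, cost)     // admission is cost-based and may reject entries under memory pressure
    time.Sleep(10 * time.Millisecond) // Set is buffered; give it a moment before reading back

    if val, ok := cache.Get(key); ok {
        fmt.Println("cached offsets:", len(val.([][]byte)))
    }
}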
@@ -421,48 +433,53 @@ func (t *Table) readIndex() (*pb.BlockOffset, error) { t.bfLock.Unlock() } + if t.opt.KeepBlockOffsetsInCache && t.opt.Cache != nil { + t.opt.Cache.Set( + t.blockOffsetsCacheKey(), + index.Offsets, + calculateOffsetsSize(index.Offsets)) + + return index.Offsets[0], nil + } + + t.blockIndex = index.Offsets return index.Offsets[0], nil } -// blockIndex returns block index for the given block id. -func (t *Table) blockIndex(idx int) *pb.BlockOffset { - if t.opt.Cache != nil { - // Try to get the block index from cache. - key := t.blockIndexCacheKey(idx) - ko, ok := t.opt.Cache.Get(key) - if ok && ko != nil { - return ko.(*pb.BlockOffset) - } +// blockOffsets returns block offsets of this table. +func (t *Table) blockOffsets() []*pb.BlockOffset { + if !t.opt.KeepBlockOffsetsInCache || t.opt.Cache == nil { + return t.blockIndex } - // Read the block index from sst. - tblIndex := t.readTableIndex() - ko := tblIndex.Offsets[idx] - if t.opt.Cache != nil { - // Set the block index in the cache. - key := t.blockIndexCacheKey(idx) - t.opt.Cache.Set(key, ko, calculateOffsetSize(ko)) + val, ok := t.opt.Cache.Get(t.blockOffsetsCacheKey()) + if ok { + return val.([]*pb.BlockOffset) } - return ko + + ti := t.readTableIndex() + + t.opt.Cache.Set(t.blockOffsetsCacheKey(), ti.Offsets, calculateOffsetsSize(ti.Offsets)) + return ti.Offsets } -func calculateOffsetSize(ko *pb.BlockOffset) int64 { - // 1 word = 8 bytes - var size int64 = 3*8 + // key array take 3 words - 1*8 + // offset and len takes 1 word - 3*8 + // XXX_unrecognized array takes 3 word. - 1*8 // so far 7 words, in order to round the slab we're adding one more word. +// calculateOffsetsSize returns the size of *pb.BlockOffset array +func calculateOffsetsSize(offsets []*pb.BlockOffset) int64 { + totalSize := sizeOfOffsetStruct * int64(len(offsets)) - // add key size. - size += int64(cap(ko.Key)) - // add XXX_unrecognized size. - size += int64(cap(ko.XXX_unrecognized)) - return size + for _, ko := range offsets { + // add key size. + totalSize += int64(cap(ko.Key)) + // add XXX_unrecognized size. + totalSize += int64(cap(ko.XXX_unrecognized)) + } + // Add three words for array size. + return totalSize + 3*8 } // block returns block from the sst. If the key offset is nil, this fuction will // obtain the block offset either from cache or sst. -func (t *Table) block(idx int, ko *pb.BlockOffset) (*block, error) { +func (t *Table) block(idx int) (*block, error) { y.AssertTruef(idx >= 0, "idx=%d", idx) if idx >= t.noOfBlocks { return nil, errors.New("block out of index") @@ -476,9 +493,7 @@ func (t *Table) block(idx int, ko *pb.BlockOffset) (*block, error) { } // Read the block index if it's nil - if ko == nil { - ko = t.blockIndex(idx) - } + ko := t.blockOffsets()[idx] blk := &block{ offset: int(ko.Offset), } @@ -564,10 +579,13 @@ func (t *Table) blockCacheKey(idx int) []byte { return buf } -func (t *Table) blockIndexCacheKey(idx int) []byte { +// blockOffsetsCacheKey returns the cache key for block offsets. +func (t *Table) blockOffsetsCacheKey() []byte { y.AssertTrue(t.id < math.MaxUint32) - y.AssertTrue(uint32(idx) < math.MaxUint32) - return []byte(fmt.Sprintf("t-%d-b-%d", t.id, idx)) + buf := make([]byte, 4) + binary.BigEndian.PutUint32(buf, uint32(t.id)) + + return append([]byte("bo"), buf...) } // EstimatedSize returns the total size of key-values stored in this table (including the @@ -644,8 +662,8 @@ func (t *Table) readTableIndex() *pb.TableIndex { // VerifyChecksum verifies checksum for all blocks of table. 
This function is called by // OpenTable() function. This function is also called inside levelsController.VerifyChecksum(). func (t *Table) VerifyChecksum() error { - for i, os := range t.readTableIndex().Offsets { - b, err := t.block(i, os) + for i, os := range t.blockOffsets() { + b, err := t.block(i) if err != nil { return y.Wrapf(err, "checksum validation failed for table: %s, block: %d, offset:%d", t.Filename(), i, os.Offset) diff --git a/table/table_test.go b/table/table_test.go index 17659afb8..a39d88e5a 100644 --- a/table/table_test.go +++ b/table/table_test.go @@ -967,7 +967,8 @@ func BenchmarkBlockOffsetSizeCalculation(b *testing.B) { func TestBlockOffsetSizeCalculation(t *testing.T) { // Empty struct testing. - require.Equal(t, calculateOffsetSize(&pb.BlockOffset{}), int64(64)) + require.Equal(t, calculateOffsetsSize([]*pb.BlockOffset{&pb.BlockOffset{}}), int64(88)) // Testing with key bytes - require.Equal(t, calculateOffsetSize(&pb.BlockOffset{Key: []byte{1, 1}}), int64(66)) + require.Equal(t, calculateOffsetsSize([]*pb.BlockOffset{&pb.BlockOffset{Key: []byte{1, 1}}}), + int64(90)) } From 6950025defff4f3085b0183589377fda219b9cc1 Mon Sep 17 00:00:00 2001 From: Tiger Date: Fri, 22 May 2020 18:03:05 +0530 Subject: [PATCH 3/6] add test Signed-off-by: Tiger --- iterator_test.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/iterator_test.go b/iterator_test.go index f9fddf913..0b21f6446 100644 --- a/iterator_test.go +++ b/iterator_test.go @@ -125,7 +125,7 @@ func TestPickSortTables(t *testing.T) { } func TestIteratePrefix(t *testing.T) { - runBadgerTest(t, nil, func(t *testing.T, db *DB) { + testIteratorPrefix := func(t *testing.T, db *DB) { bkey := func(i int) []byte { return []byte(fmt.Sprintf("%04d", i)) } @@ -198,7 +198,22 @@ func TestIteratePrefix(t *testing.T) { for i := 0; i < n; i++ { require.Equal(t, 1, countOneKey(bkey(i))) } + } + + t.Run("With Default options", func(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + testIteratorPrefix(t, db) + }) }) + + t.Run("With Block Offsets in Cache", func(t *testing.T) { + opts := getTestOptions("") + opts = opts.WithKeepBlockOffsetsInCache(true) + runBadgerTest(t, &opts, func(t *testing.T, db *DB) { + testIteratorPrefix(t, db) + }) + }) + } // go test -v -run=XXX -bench=BenchmarkIterate -benchtime=3s From 445dede96fe2844eac148c7e2136593b04b5e1df Mon Sep 17 00:00:00 2001 From: Tiger Date: Tue, 26 May 2020 17:30:16 +0530 Subject: [PATCH 4/6] minor Signed-off-by: Tiger --- iterator_test.go | 18 +++++++++++++++++- options.go | 46 +++++++++++++++++++++++++++++++++++----------- table/table.go | 19 ++++++++++--------- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/iterator_test.go b/iterator_test.go index 0b21f6446..cbaf6bec1 100644 --- a/iterator_test.go +++ b/iterator_test.go @@ -208,7 +208,23 @@ func TestIteratePrefix(t *testing.T) { t.Run("With Block Offsets in Cache", func(t *testing.T) { opts := getTestOptions("") - opts = opts.WithKeepBlockOffsetsInCache(true) + opts = opts.WithKeepBlockIndicesInCache(true) + runBadgerTest(t, &opts, func(t *testing.T, db *DB) { + testIteratorPrefix(t, db) + }) + }) + + t.Run("With Block Offsets and Blocks in Cache", func(t *testing.T) { + opts := getTestOptions("") + opts = opts.WithKeepBlockIndicesInCache(true).WithKeepBlocksInCache(true) + runBadgerTest(t, &opts, func(t *testing.T, db *DB) { + testIteratorPrefix(t, db) + }) + }) + + t.Run("With Blocks in Cache", func(t *testing.T) { + opts := getTestOptions("") + opts = 
opts.WithKeepBlocksInCache(true) runBadgerTest(t, &opts, func(t *testing.T, db *DB) { testIteratorPrefix(t, db) }) diff --git a/options.go b/options.go index 7c5844a0f..824e5cace 100644 --- a/options.go +++ b/options.go @@ -93,6 +93,12 @@ type Options struct { // ChecksumVerificationMode decides when db should verify checksums for SSTable blocks. ChecksumVerificationMode options.ChecksumVerificationMode + // KeepBlockIndicesInCache decides whether to keep the block offsets in the cache or not. + KeepBlockIndicesInCache bool + + // KeepBlocksInCache decides whether to keep the sst blocks in the cache or not. + KeepBlocksInCache bool + // Transaction start and commit timestamps are managed by end-user. // This is only useful for databases built on top of Badger (like Dgraph). // Not recommended for most users. @@ -102,9 +108,6 @@ type Options struct { // ------------------------------ maxBatchCount int64 // max entries in batch maxBatchSize int64 // max batch size in bytes - - KeepBlockOffsetsInCache bool // if KeepBlockOffsetsInCache is true and cache is enabled then, - // block offsets are kept in cache. } // DefaultOptions sets a list of recommended options for good performance. @@ -160,7 +163,8 @@ func DefaultOptions(path string) Options { LogRotatesToFlush: 2, EncryptionKey: []byte{}, EncryptionKeyRotationDuration: 10 * 24 * time.Hour, // Default 10 days. - KeepBlockOffsetsInCache: false, + KeepBlocksInCache: false, + KeepBlockIndicesInCache: false, } } @@ -174,7 +178,8 @@ func buildTableOptions(opt Options) table.Options { ChkMode: opt.ChecksumVerificationMode, Compression: opt.Compression, ZSTDCompressionLevel: opt.ZSTDCompressionLevel, - KeepBlockOffsetsInCache: opt.KeepBlockOffsetsInCache, + KeepBlockIndicesInCache: opt.KeepBlockIndicesInCache, + KeepBlocksInCache: opt.KeepBlocksInCache, } } @@ -637,15 +642,34 @@ func (opt Options) WithLoadBloomsOnOpen(b bool) Options { return opt } -// WithKeepBlockOffsetsInCache returns a new Option value with KeepBlockOffsetsInCache set to the +// WithKeepBlockIndicesInCache returns a new Option value with KeepBlockOffsetInCache set to the // given value. // -// Badger uses block offsets to to find the blocks in the sst table. Blocks offset also contains -// starting key of block. Badger uses those keys to find the block to do an effective key iteration. -// This option is used to reduce inmemory usage of BlockOffsets. +// When this option is set badger will store the block offsets in a cache along with +// the blocks. The size of the cache is determined by the MaxBlockCacheSize option. +// When indices are stored in the cache, the read performance might be affected but +// the cache limits the amount of memory used by the indices. // // The default value of KeepBlockOffsetInCache is false. -func (opt Options) WithKeepBlockOffsetsInCache(val bool) Options { - opt.KeepBlockOffsetsInCache = val +func (opt Options) WithKeepBlockIndicesInCache(val bool) Options { + opt.KeepBlockIndicesInCache = val + + if val && opt.MaxCacheSize == 0 { + opt.MaxCacheSize = 100 << 20 + } + return opt +} + +// WithKeepBlocksInCache returns a new Option value with KeepBlocksInCache set to the +// given value. +// +// When this option is set badger will store the block in the cache. The default value of +// KeepBlocksInCache is false. 
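Before the matching WithKeepBlocksInCache setter that follows, here is roughly how a caller combines the two new options when opening a DB. The directory and cache size below are placeholders, and only the two WithKeep* setters come from this series; DefaultOptions, WithMaxCacheSize, and Open are existing badger v2 API.

package main

import (
    "log"

    badger "github.com/dgraph-io/badger/v2"
)

func main() {
    opts := badger.DefaultOptions("/tmp/badger").
        WithMaxCacheSize(200 << 20).      // shared cache budget; the new setters default it to 100 MB if left at zero
        WithKeepBlocksInCache(true).      // keep decompressed, decrypted blocks in the cache
        WithKeepBlockIndicesInCache(true) // keep block offsets in the cache instead of pinned in memory per table

    db, err := badger.Open(opts)
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Use db as usual; reads now pull block offsets through the cache.
}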
+func (opt Options) WithKeepBlocksInCache(val bool) Options { + opt.KeepBlocksInCache = val + + if val && opt.MaxCacheSize == 0 { + opt.MaxCacheSize = 100 << 20 + } return opt } diff --git a/table/table.go b/table/table.go index bf3551916..749ae60bc 100644 --- a/table/table.go +++ b/table/table.go @@ -89,9 +89,11 @@ type Options struct { // Otherwise they will be loaded on table open. LoadBloomsOnOpen bool - // KeepBlockOffsetsInCache is set to true and cache is enabled. Then block offsets are - // kept in cache. - KeepBlockOffsetsInCache bool + // KeepBlockIndicesInCache decides whether to keep the block offsets in the cache or not. + KeepBlockIndicesInCache bool + + // KeepBlocksInCache decides whether to keep the block in the cache or not. + KeepBlocksInCache bool } // TableInterface is useful for testing. @@ -454,7 +456,7 @@ func (t *Table) readIndex() (*pb.BlockOffset, error) { t.bfLock.Unlock() } - if t.opt.KeepBlockOffsetsInCache && t.opt.Cache != nil { + if t.opt.KeepBlockIndicesInCache && t.opt.Cache != nil { t.opt.Cache.Set( t.blockOffsetsCacheKey(), index.Offsets, @@ -469,12 +471,11 @@ func (t *Table) readIndex() (*pb.BlockOffset, error) { // blockOffsets returns block offsets of this table. func (t *Table) blockOffsets() []*pb.BlockOffset { - if !t.opt.KeepBlockOffsetsInCache || t.opt.Cache == nil { + if !t.opt.KeepBlockIndicesInCache || t.opt.Cache == nil { return t.blockIndex } - val, ok := t.opt.Cache.Get(t.blockOffsetsCacheKey()) - if ok { + if val, ok := t.opt.Cache.Get(t.blockOffsetsCacheKey()); ok && val != nil { return val.([]*pb.BlockOffset) } @@ -506,7 +507,7 @@ func (t *Table) block(idx int) (*block, error) { if idx >= t.noOfBlocks { return nil, errors.New("block out of index") } - if t.opt.Cache != nil { + if t.opt.Cache != nil && t.opt.KeepBlocksInCache { key := t.blockCacheKey(idx) blk, ok := t.opt.Cache.Get(key) if ok && blk != nil { @@ -578,7 +579,7 @@ func (t *Table) block(idx int) (*block, error) { return nil, err } } - if t.opt.Cache != nil { + if t.opt.Cache != nil && t.opt.KeepBlocksInCache { key := t.blockCacheKey(idx) // incrRef should never return false here because we're calling it on a // new block with ref=1. From e8ab8df4f1d53e64ee6ffefb7e1bb7e4720f2895 Mon Sep 17 00:00:00 2001 From: Tiger Date: Wed, 27 May 2020 14:48:29 +0530 Subject: [PATCH 5/6] fix comment and allocation Signed-off-by: Tiger --- options.go | 15 +++++++++------ table/table.go | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/options.go b/options.go index 824e5cace..fedb7cd98 100644 --- a/options.go +++ b/options.go @@ -645,10 +645,10 @@ func (opt Options) WithLoadBloomsOnOpen(b bool) Options { // WithKeepBlockIndicesInCache returns a new Option value with KeepBlockOffsetInCache set to the // given value. // -// When this option is set badger will store the block offsets in a cache along with -// the blocks. The size of the cache is determined by the MaxBlockCacheSize option. -// When indices are stored in the cache, the read performance might be affected but -// the cache limits the amount of memory used by the indices. +// When this option is set badger will store the block offsets in a cache along with the blocks. +// The size of the cache is determined by the MaxCacheSize option.If the MaxCacheSize is set to +// zero, then MaxCacheSize is set to 100 mb. When indices are stored in the cache, the read +// performance might be affected but the cache limits the amount of memory used by the indices. // // The default value of KeepBlockOffsetInCache is false. 
func (opt Options) WithKeepBlockIndicesInCache(val bool) Options { @@ -663,8 +663,11 @@ func (opt Options) WithKeepBlockIndicesInCache(val bool) Options { // WithKeepBlocksInCache returns a new Option value with KeepBlocksInCache set to the // given value. // -// When this option is set badger will store the block in the cache. The default value of -// KeepBlocksInCache is false. +// When this option is set badger will store the block in the cache. The size of the cache is +// determined by the MaxCacheSize option.If the MaxCacheSize is set to zero, +// then MaxCacheSize is set to 100 mb. +// +// The default value of KeepBlocksInCache is false. func (opt Options) WithKeepBlocksInCache(val bool) Options { opt.KeepBlocksInCache = val diff --git a/table/table.go b/table/table.go index 749ae60bc..8b88a69a9 100644 --- a/table/table.go +++ b/table/table.go @@ -617,7 +617,7 @@ func (t *Table) blockCacheKey(idx int) []byte { // blockOffsetsCacheKey returns the cache key for block offsets. func (t *Table) blockOffsetsCacheKey() []byte { y.AssertTrue(t.id < math.MaxUint32) - buf := make([]byte, 4) + buf := make([]byte, 4, 6) binary.BigEndian.PutUint32(buf, uint32(t.id)) return append([]byte("bo"), buf...) From 81b0a5d99e99a8661ed0644a7fa5ea2b18dd4155 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Fri, 29 May 2020 00:59:50 +0530 Subject: [PATCH 6/6] Increase test timeout from 15m to 25m This PR adds 3 new tests and each of them about 2m and so the test suite cannot complete in even 15 minutes. --- test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test.sh b/test.sh index 05c28ae88..06c9396c5 100755 --- a/test.sh +++ b/test.sh @@ -38,9 +38,9 @@ go test -v -race $packages echo echo "==> Starting tests with value log mmapped..." # Run top level package tests with mmap flag. -go test -timeout=15m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=true +go test -timeout=25m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=true echo echo "==> Starting tests with value log not mmapped..." -go test -timeout=15m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=false +go test -timeout=25m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=false
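Since the offsets entry and the per-block entries live in the same cache, the two key formats have to stay distinct. A rough sketch of both layouts, mirroring blockCacheKey and the new blockOffsetsCacheKey; treat it as an illustration rather than the exact badger code.

package main

import (
    "encoding/binary"
    "fmt"
)

// blockKey mirrors the existing per-block cache key: 4-byte table id followed by 4-byte block index.
func blockKey(tableID, idx uint32) []byte {
    buf := make([]byte, 8)
    binary.BigEndian.PutUint32(buf[:4], tableID)
    binary.BigEndian.PutUint32(buf[4:], idx)
    return buf
}

// offsetsKey mirrors the new blockOffsetsCacheKey: a "bo" prefix plus the 4-byte table id.
// The length and prefix differ from blockKey, so a table's offsets entry and its block
// entries can never share a key.
func offsetsKey(tableID uint32) []byte {
    buf := make([]byte, 4, 6)
    binary.BigEndian.PutUint32(buf, tableID)
    return append([]byte("bo"), buf...)
}

func main() {
    fmt.Printf("block key:   %x (%d bytes)\n", blockKey(42, 7), len(blockKey(42, 7)))
    fmt.Printf("offsets key: %x (%d bytes)\n", offsetsKey(42), len(offsetsKey(42)))
}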