From b03de7b84ea5c087c7aeb56067248fbac24bb041 Mon Sep 17 00:00:00 2001
From: Nick Travers
Date: Wed, 16 Feb 2022 19:46:39 -0800
Subject: [PATCH] *: set track smallest / largest keys separately in manifest

Currently, only point keys are tracked in the manifest. With the
addition of range keys, the bounds of an SSTable should be computed by
considering the bounds of both the point keys and the range keys, and
taking the smallest or largest across both types of key, respectively.

Add four additional fields, `{Smallest,Largest}{Point,Range}Key`, to
`manifest.FileMetadata` to separately track the point and range key
bounds. The existing `Smallest` and `Largest` fields are used to track
the bounds across both point and range keys.

Update the existing call sites that set the smallest and largest keys
to set all three types of bounds: point keys, range keys and combined.
---
 compaction.go                 |   4 ++
 flush_external.go             |  18 +++---
 ingest.go                     | 106 ++++++++++++++++++++++++++--------
 ingest_test.go                |  19 ++++--
 internal/base/internal.go     |  10 ++++
 internal/manifest/version.go  |  48 +++++++++++++--
 internal/rangekey/rangekey.go |  32 +++++-----
 level_iter_test.go            |   4 +-
 sstable/writer.go             |  38 ++++++++++--
 testdata/ingest_load          |  43 ++++++++++++++
 10 files changed, 263 insertions(+), 59 deletions(-)

diff --git a/compaction.go b/compaction.go
index 7ee31b2194..3acce3196e 100644
--- a/compaction.go
+++ b/compaction.go
@@ -2259,6 +2259,10 @@ func (d *DB) runCompaction(
 			)
 		}
 
+		meta.SmallestPointKey = writerMeta.SmallestPointKey(d.cmp)
+		meta.LargestPointKey = writerMeta.LargestPointKey(d.cmp)
+		meta.SmallestRangeKey = writerMeta.SmallestRangeKey
+		meta.LargestRangeKey = writerMeta.LargestRangeKey
 		meta.Smallest = writerMeta.Smallest(d.cmp)
 		meta.Largest = writerMeta.Largest(d.cmp)
 
diff --git a/flush_external.go b/flush_external.go
index 0882113c22..ccf7a0bbe0 100644
--- a/flush_external.go
+++ b/flush_external.go
@@ -30,13 +30,17 @@ func flushExternalTable(untypedDB interface{}, path string, originalMeta *fileMe
 	d.mu.Unlock()
 
 	m := &fileMetadata{
-		FileNum:        fileNum,
-		Size:           originalMeta.Size,
-		CreationTime:   time.Now().Unix(),
-		Smallest:       originalMeta.Smallest,
-		Largest:        originalMeta.Largest,
-		SmallestSeqNum: originalMeta.SmallestSeqNum,
-		LargestSeqNum:  originalMeta.LargestSeqNum,
+		FileNum:          fileNum,
+		Size:             originalMeta.Size,
+		CreationTime:     time.Now().Unix(),
+		SmallestPointKey: originalMeta.SmallestPointKey,
+		LargestPointKey:  originalMeta.LargestPointKey,
+		SmallestRangeKey: originalMeta.SmallestRangeKey,
+		LargestRangeKey:  originalMeta.LargestRangeKey,
+		Smallest:         originalMeta.Smallest,
+		Largest:          originalMeta.Largest,
+		SmallestSeqNum:   originalMeta.SmallestSeqNum,
+		LargestSeqNum:    originalMeta.LargestSeqNum,
 	}
 
 	// Hard link the sstable into the DB directory.
diff --git a/ingest.go b/ingest.go
index 6fe9f34193..82f372565e 100644
--- a/ingest.go
+++ b/ingest.go
@@ -13,6 +13,7 @@ import (
 	"github.com/cockroachdb/pebble/internal/keyspan"
 	"github.com/cockroachdb/pebble/internal/manifest"
 	"github.com/cockroachdb/pebble/internal/private"
+	"github.com/cockroachdb/pebble/internal/rangekey"
 	"github.com/cockroachdb/pebble/sstable"
 	"github.com/cockroachdb/pebble/vfs"
 )
@@ -84,8 +85,6 @@ func ingestLoad1(
 	meta.FileNum = fileNum
 	meta.Size = uint64(stat.Size())
 	meta.CreationTime = time.Now().Unix()
-	meta.Smallest = InternalKey{}
-	meta.Largest = InternalKey{}
 
 	// Avoid loading into the table cache for collecting stats if we
 	// don't need to. If there are no range deletions, we have all the
@@ -98,9 +97,7 @@ func ingestLoad1(
 	// calculating stats before we can remove the original link.
 	maybeSetStatsFromProperties(meta, &r.Properties)
 
-	smallestSet, largestSet := false, false
-	empty := true
-
+	hasPoints := false
 	{
 		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
 		if err != nil {
@@ -111,9 +108,7 @@ func ingestLoad1(
 			if err := ingestValidateKey(opts, key); err != nil {
 				return nil, err
 			}
-			empty = false
-			meta.Smallest = key.Clone()
-			smallestSet = true
+			meta.SmallestPointKey = key.Clone()
 		}
 		if err := iter.Error(); err != nil {
 			return nil, err
@@ -122,9 +117,8 @@ func ingestLoad1(
 			if err := ingestValidateKey(opts, key); err != nil {
 				return nil, err
 			}
-			empty = false
-			meta.Largest = key.Clone()
-			largestSet = true
+			meta.LargestPointKey = key.Clone()
+			hasPoints = true // Implies smallest point key was also set.
 		}
 		if err := iter.Error(); err != nil {
 			return nil, err
@@ -141,10 +135,9 @@ func ingestLoad1(
 			if err := ingestValidateKey(opts, key); err != nil {
 				return nil, err
 			}
-			empty = false
-			if !smallestSet ||
-				base.InternalCompare(opts.Comparer.Compare, meta.Smallest, *key) > 0 {
-				meta.Smallest = key.Clone()
+			if !hasPoints ||
+				base.InternalCompare(opts.Comparer.Compare, meta.SmallestPointKey, *key) > 0 {
+				meta.SmallestPointKey = key.Clone()
 			}
 		}
 		if err := iter.Error(); err != nil {
@@ -154,18 +147,82 @@ func ingestLoad1(
 			if err := ingestValidateKey(opts, key); err != nil {
 				return nil, err
 			}
-			empty = false
 			end := base.MakeRangeDeleteSentinelKey(val)
-			if !largestSet ||
-				base.InternalCompare(opts.Comparer.Compare, meta.Largest, end) < 0 {
-				meta.Largest = end.Clone()
+			if !hasPoints ||
+				base.InternalCompare(opts.Comparer.Compare, meta.LargestPointKey, end) < 0 {
+				meta.LargestPointKey = end.Clone()
+				hasPoints = true // Implies smallest point key was also set.
+			}
+		}
+	}
+
+	// Update the range-key bounds for the table.
+	var hasRanges bool
+	{
+		iter, err := r.NewRawRangeKeyIter()
+		if err != nil {
+			return nil, err
+		}
+		if iter != nil {
+			defer iter.Close()
+			if key, _ := iter.First(); key != nil {
+				if err := ingestValidateKey(opts, key); err != nil {
+					return nil, err
+				}
+				meta.SmallestRangeKey = key.Clone()
+			}
+			if err := iter.Error(); err != nil {
+				return nil, err
+			}
+			if key, value := iter.Last(); key != nil {
+				if err := ingestValidateKey(opts, key); err != nil {
+					return nil, err
+				}
+				// As range keys are fragmented, the end key of the last range key in
+				// the table provides the upper bound for the table.
+				end, _, ok := rangekey.DecodeEndKey(key.Kind(), value)
+				if !ok {
+					return nil, errors.Newf("pebble: could not decode range end key")
+				}
+				meta.LargestRangeKey = base.MakeRangeKeySentinelKey(end).Clone()
+				hasRanges = true // Implies smallest range key was also set.
+			}
+			if err := iter.Error(); err != nil {
+				return nil, err
 			}
 		}
 	}
 
-	if empty {
+	if !hasPoints && !hasRanges {
 		return nil, nil
 	}
+
+	// Compute the overall smallest / largest fields from the point and range
+	// key bounds.
+	switch {
+	case !hasRanges:
+		// Table has only point keys. Use the point key bounds.
+		meta.Smallest = meta.SmallestPointKey.Clone()
+		meta.Largest = meta.LargestPointKey.Clone()
+	case !hasPoints:
+		// Table has only range keys. Use the range key bounds.
+		meta.Smallest = meta.SmallestRangeKey.Clone()
+		meta.Largest = meta.LargestRangeKey.Clone()
+	default:
+		// Table has both points and ranges. Compute the bounds by considering both
+		// the point and range key bounds.
+		if base.InternalCompare(opts.Comparer.Compare, meta.SmallestPointKey, meta.SmallestRangeKey) < 0 {
+			meta.Smallest = meta.SmallestPointKey.Clone()
+		} else {
+			meta.Smallest = meta.SmallestRangeKey.Clone()
+		}
+		if base.InternalCompare(opts.Comparer.Compare, meta.LargestPointKey, meta.LargestRangeKey) > 0 {
+			meta.Largest = meta.LargestPointKey.Clone()
+		} else {
+			meta.Largest = meta.LargestRangeKey.Clone()
+		}
+	}
+
 	return meta, nil
 }
 
@@ -314,12 +371,15 @@ func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bo
 }
 
 func ingestUpdateSeqNum(opts *Options, dirname string, seqNum uint64, meta []*fileMetadata) error {
+	// TODO(travers): Do we need to update the seqnums on the point and range
+	// smallest / largest keys? Or just the combined bounds for each table?
 	for _, m := range meta {
 		m.Smallest = base.MakeInternalKey(m.Smallest.UserKey, seqNum, m.Smallest.Kind())
 		// Don't update the seqnum for the largest key if that key is a range
-		// deletion sentinel key as doing so unintentionally extends the bounds of
-		// the table.
-		if m.Largest.Trailer != InternalKeyRangeDeleteSentinel {
+		// deletion sentinel key or a range key boundary, as doing so
+		// unintentionally extends the bounds of the table.
+		if m.Largest.Trailer != InternalKeyRangeDeleteSentinel &&
+			m.Largest.Trailer != base.InternalKeyBoundaryRangeKey {
 			m.Largest = base.MakeInternalKey(m.Largest.UserKey, seqNum, m.Largest.Kind())
 		}
 		// Setting smallestSeqNum == largestSeqNum triggers the setting of
diff --git a/ingest_test.go b/ingest_test.go
index 1e5b7b34bc..a8ab33c0dd 100644
--- a/ingest_test.go
+++ b/ingest_test.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cockroachdb/pebble/internal/datadriven"
 	"github.com/cockroachdb/pebble/internal/errorfs"
 	"github.com/cockroachdb/pebble/internal/manifest"
+	"github.com/cockroachdb/pebble/internal/rangekey"
 	"github.com/cockroachdb/pebble/sstable"
 	"github.com/cockroachdb/pebble/vfs"
 	"github.com/kr/pretty"
 )
@@ -63,8 +64,14 @@ func TestIngestLoad(t *testing.T) {
 					return fmt.Sprintf("malformed input: %s\n", data)
 				}
 				key := base.ParseInternalKey(data[:j])
-				value := []byte(data[j+1:])
-				if err := w.Add(key, value); err != nil {
+				if k := key.Kind(); rangekey.IsRangeKey(k) {
+					value := rangekey.ParseValue(k, data[j+1:])
+					err = w.AddRangeKey(key, value)
+				} else {
+					value := []byte(data[j+1:])
+					err = w.Add(key, value)
+				}
+				if err != nil {
 					return err.Error()
 				}
 			}
@@ -81,6 +88,8 @@ func TestIngestLoad(t *testing.T) {
 			var buf bytes.Buffer
 			for _, m := range meta {
 				fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest)
+				fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey)
+				fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey)
 			}
 			return buf.String()
 
@@ -131,8 +140,10 @@ func TestIngestLoadRand(t *testing.T) {
 			return base.InternalCompare(cmp, keys[i], keys[j]) < 0
 		})
 
-		expected[i].Smallest = keys[0]
-		expected[i].Largest = keys[len(keys)-1]
+		expected[i].SmallestPointKey = keys[0]
+		expected[i].LargestPointKey = keys[len(keys)-1]
+		expected[i].Smallest = expected[i].SmallestPointKey
+		expected[i].Largest = expected[i].LargestPointKey
 
 		w := sstable.NewWriter(f, sstable.WriterOptions{})
 		var count uint64
diff --git a/internal/base/internal.go b/internal/base/internal.go
index 0ee3d4efbf..7e589a053b 100644
--- a/internal/base/internal.go
+++ b/internal/base/internal.go
@@ -163,6 +163,16 @@ func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey {
 	}
 }
 
+// MakeRangeKeySentinelKey constructs an internal key that is a range key
+// sentinel key, used as the upper boundary for an sstable when a range key is
+// the largest key in an sstable.
+func MakeRangeKeySentinelKey(userKey []byte) InternalKey {
+	return InternalKey{
+		UserKey: userKey,
+		Trailer: InternalKeyBoundaryRangeKey,
+	}
+}
+
 var kindsMap = map[string]InternalKeyKind{
 	"DEL":       InternalKeyKindDelete,
 	"SINGLEDEL": InternalKeyKindSingleDelete,
diff --git a/internal/manifest/version.go b/internal/manifest/version.go
index 7219e0ce4d..d6b2691591 100644
--- a/internal/manifest/version.go
+++ b/internal/manifest/version.go
@@ -96,11 +96,25 @@ type FileMetadata struct {
 	// UTC). For ingested sstables, this corresponds to the time the file was
 	// ingested.
 	CreationTime int64
-	// Smallest and Largest are the inclusive bounds for the internal keys
-	// stored in the table.
+	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
+	// internal point keys stored in the table.
+	SmallestPointKey InternalKey
+	LargestPointKey  InternalKey
+	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
+	// internal range keys stored in the table.
+	SmallestRangeKey InternalKey
+	LargestRangeKey  InternalKey
+	// Smallest and Largest are the inclusive bounds for the internal keys stored
+	// in the table, across both point and range keys. These values can be
+	// reconstructed from the respective point and range key fields.
+	// TODO(travers): Should these fields be derived via functions?
 	Smallest InternalKey
 	Largest  InternalKey
-	// Smallest and largest sequence numbers in the table.
+	// Smallest and largest sequence numbers in the table, across both point and
+	// range keys. These values can be reconstructed from the respective point and
+	// range key fields.
+	// TODO(travers): Do we need separate fields for point / range key seqnums?
+	// TODO(travers): Should these fields be derived via functions?
 	SmallestSeqNum uint64
 	LargestSeqNum  uint64
 	// True if the file is actively being compacted. Protected by DB.mu.
@@ -133,6 +147,24 @@ func (m *FileMetadata) String() string {
 // Validate validates the metadata for consistency with itself, returning an
 // error if inconsistent.
 func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
+	// Point key validation.
+
+	if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
+		return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
+			errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
+			m.LargestPointKey.Pretty(formatKey))
+	}
+
+	// Range key validation.
+
+	if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
+		return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
+			errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
+			m.LargestRangeKey.Pretty(formatKey))
+	}
+
+	// Combined range and point key validation.
+
 	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
 		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
 			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
@@ -466,10 +498,14 @@ func ParseVersionDebug(
 			if err != nil {
 				return nil, err
 			}
+			smallest := base.ParsePrettyInternalKey(fields[1])
+			largest := base.ParsePrettyInternalKey(fields[2])
 			files[level] = append(files[level], &FileMetadata{
-				FileNum:  base.FileNum(fileNum),
-				Smallest: base.ParsePrettyInternalKey(fields[1]),
-				Largest:  base.ParsePrettyInternalKey(fields[2]),
+				FileNum:          base.FileNum(fileNum),
+				SmallestPointKey: smallest,
+				LargestPointKey:  largest,
+				Smallest:         smallest,
+				Largest:          largest,
 			})
 		}
 	}
diff --git a/internal/rangekey/rangekey.go b/internal/rangekey/rangekey.go
index 316c1fb23d..023bd4cbc4 100644
--- a/internal/rangekey/rangekey.go
+++ b/internal/rangekey/rangekey.go
@@ -312,13 +312,19 @@ func Parse(s string) (key base.InternalKey, value []byte) {
 		panic("range key string representation missing key-value separator :")
 	}
 	startKey := base.ParseInternalKey(strings.TrimSpace(s[:sep]))
+	return startKey, ParseValue(startKey.Kind(), s[sep+1:])
+}
 
-	switch startKey.Kind() {
+// ParseValue parses a string representation of a range key value into its
+// serialized form. See Parse for the input string format.
+// TODO(travers): tests.
+func ParseValue(kind base.InternalKeyKind, s string) (value []byte) {
+	switch kind {
 	case base.InternalKeyKindRangeKeySet:
-		openBracket := strings.IndexByte(s[sep:], '[')
-		closeBracket := strings.IndexByte(s[sep:], ']')
-		endKey := strings.TrimSpace(s[sep+1 : sep+openBracket])
-		itemStrs := strings.Split(s[sep+openBracket+1:sep+closeBracket], ",")
+		openBracket := strings.IndexByte(s[:], '[')
+		closeBracket := strings.IndexByte(s[:], ']')
+		endKey := strings.TrimSpace(s[:openBracket])
+		itemStrs := strings.Split(s[openBracket+1:closeBracket], ",")
 
 		var suffixValues []SuffixValue
 		for _, itemStr := range itemStrs {
@@ -334,13 +340,13 @@ func Parse(s string) (key base.InternalKey, value []byte) {
 		}
 		value = make([]byte, EncodedSetValueLen([]byte(endKey), suffixValues))
 		EncodeSetValue(value, []byte(endKey), suffixValues)
-		return startKey, value
+		return value
 
 	case base.InternalKeyKindRangeKeyUnset:
-		openBracket := strings.IndexByte(s[sep:], '[')
-		closeBracket := strings.IndexByte(s[sep:], ']')
-		endKey := strings.TrimSpace(s[sep+1 : sep+openBracket])
-		itemStrs := strings.Split(s[sep+openBracket+1:sep+closeBracket], ",")
+		openBracket := strings.IndexByte(s[:], '[')
+		closeBracket := strings.IndexByte(s[:], ']')
+		endKey := strings.TrimSpace(s[:openBracket])
+		itemStrs := strings.Split(s[openBracket+1:closeBracket], ",")
 
 		var suffixes [][]byte
 		for _, itemStr := range itemStrs {
@@ -348,13 +354,13 @@ func Parse(s string) (key base.InternalKey, value []byte) {
 		}
 		value = make([]byte, EncodedUnsetValueLen([]byte(endKey), suffixes))
 		EncodeUnsetValue(value, []byte(endKey), suffixes)
-		return startKey, value
+		return value
 
 	case base.InternalKeyKindRangeKeyDelete:
-		return startKey, []byte(strings.TrimSpace(s[sep+1:]))
+		return []byte(strings.TrimSpace(s))
 
 	default:
-		panic(fmt.Sprintf("key kind %q not a range key", startKey.Kind()))
+		panic(fmt.Sprintf("key kind %q not a range key", kind))
 	}
 }
diff --git a/level_iter_test.go b/level_iter_test.go
index d174707035..8280227cf3 100644
--- a/level_iter_test.go
+++ b/level_iter_test.go
@@ -238,8 +238,8 @@ func (lt *levelIterTest) runBuild(d *datadriven.TestData) string {
 	lt.readers = append(lt.readers, r)
 	lt.metas = append(lt.metas, &fileMetadata{
 		FileNum:  fileNum,
-		Smallest: meta.Smallest(lt.cmp.Compare),
-		Largest:  meta.Largest(lt.cmp.Compare),
+		Smallest: meta.SmallestPointKey(lt.cmp.Compare),
+		Largest:  meta.LargestPointKey(lt.cmp.Compare),
 	})
 
 	var buf bytes.Buffer
diff --git a/sstable/writer.go b/sstable/writer.go
index 1c8dcdb6dd..db8177aae8 100644
--- a/sstable/writer.go
+++ b/sstable/writer.go
@@ -48,8 +48,8 @@ func (m *WriterMetadata) updateSeqNum(seqNum uint64) {
 	}
 }
 
-// Smallest returns the smaller of SmallestPoint and SmallestRangeDel.
-func (m *WriterMetadata) Smallest(cmp Compare) InternalKey {
+// SmallestPointKey returns the smaller of SmallestPoint and SmallestRangeDel.
+func (m *WriterMetadata) SmallestPointKey(cmp Compare) InternalKey {
 	if m.SmallestPoint.UserKey == nil {
 		return m.SmallestRangeDel
 	}
@@ -62,8 +62,8 @@ func (m *WriterMetadata) Smallest(cmp Compare) InternalKey {
 	return m.SmallestRangeDel
 }
 
-// Largest returns the larget of LargestPoint and LargestRangeDel.
-func (m *WriterMetadata) Largest(cmp Compare) InternalKey {
+// LargestPointKey returns the larger of LargestPoint and LargestRangeDel.
+func (m *WriterMetadata) LargestPointKey(cmp Compare) InternalKey {
 	if m.LargestPoint.UserKey == nil {
 		return m.LargestRangeDel
 	}
@@ -76,6 +76,36 @@ func (m *WriterMetadata) Largest(cmp Compare) InternalKey {
 	return m.LargestRangeDel
 }
 
+// Smallest returns the smaller of SmallestPointKey and SmallestRangeKey.
+func (m *WriterMetadata) Smallest(cmp Compare) InternalKey {
+	point := m.SmallestPointKey(cmp)
+	if point.UserKey == nil {
+		return m.SmallestRangeKey
+	}
+	if m.SmallestRangeKey.UserKey == nil {
+		return point
+	}
+	if base.InternalCompare(cmp, point, m.SmallestRangeKey) < 0 {
+		return point
+	}
+	return m.SmallestRangeKey
+}
+
+// Largest returns the larger of LargestPointKey and LargestRangeKey.
+func (m *WriterMetadata) Largest(cmp Compare) InternalKey {
+	point := m.LargestPointKey(cmp)
+	if point.UserKey == nil {
+		return m.LargestRangeKey
+	}
+	if m.LargestRangeKey.UserKey == nil {
+		return point
+	}
+	if base.InternalCompare(cmp, point, m.LargestRangeKey) > 0 {
+		return point
+	}
+	return m.LargestRangeKey
+}
+
 type flusher interface {
 	Flush() error
 }
diff --git a/testdata/ingest_load b/testdata/ingest_load
index 7c8bce6cd0..a6ea98eee2 100644
--- a/testdata/ingest_load
+++ b/testdata/ingest_load
@@ -16,57 +16,77 @@ load
 a.SET.0:
 ----
 1: a#0,1-a#0,1
+ points: a#0,1-a#0,1
+ ranges: #0,0-#0,0
 
 load
 a.SET.0:
 b.SET.0:
 ----
 1: a#0,1-b#0,1
+ points: a#0,1-b#0,1
+ ranges: #0,0-#0,0
 
 load
 a.DEL.0:
 ----
 1: a#0,0-a#0,0
+ points: a#0,0-a#0,0
+ ranges: #0,0-#0,0
 
 load
 a.DEL.0:
 b.DEL.0:
 ----
 1: a#0,0-b#0,0
+ points: a#0,0-b#0,0
+ ranges: #0,0-#0,0
 
 load
 a.MERGE.0:
 ----
 1: a#0,2-a#0,2
+ points: a#0,2-a#0,2
+ ranges: #0,0-#0,0
 
 load
 a.MERGE.0:
 b.MERGE.0:
 ----
 1: a#0,2-b#0,2
+ points: a#0,2-b#0,2
+ ranges: #0,0-#0,0
 
 load
 a.RANGEDEL.0:b
 ----
 1: a#0,15-b#72057594037927935,15
+ points: a#0,15-b#72057594037927935,15
+ ranges: #0,0-#0,0
 
 load
 a.SET.0:
 a.RANGEDEL.0:b
 ----
 1: a#0,15-b#72057594037927935,15
+ points: a#0,15-b#72057594037927935,15
+ ranges: #0,0-#0,0
 
 load
 a.SET.0:
 a.RANGEDEL.0:b
 ----
 1: a#0,15-b#72057594037927935,15
+ points: a#0,15-b#72057594037927935,15
+ ranges: #0,0-#0,0
 
 load
 b.SET.0:
 a.RANGEDEL.0:b
 ----
 1: a#0,15-b#0,1
+ points: a#0,15-b#0,1
+ ranges: #0,0-#0,0
 
 # Loading tables at an unsupported table format results in an error.
 # Write a table at version 6 (Pebble,v2) into a DB at version 5 (Pebble,v1).
@@ -74,3 +94,26 @@ load writer-version=6 db-version=5
 a.SET.1:
 ----
 pebble: table with format (Pebble,v2) unsupported at DB format major version 5, (Pebble,v1)
+
+# Tables with range keys only.
+
+load
+a.RANGEKEYSET.0:z [(@1=foo)]
+----
+1: a#0,21-z#72057594037927935,21
+ points: #0,0-#0,0
+ ranges: a#0,21-z#72057594037927935,21
+
+# Tables with a mixture of point and range keys.
+
+load
+a.SET.0:
+b.SET.0:
+c.SET.0:
+w.RANGEKEYSET.0:x [(@1=foo)]
+x.RANGEKEYSET.0:y [(@2=bar)]
+y.RANGEKEYSET.0:z [(@3=baz)]
+----
+1: a#0,1-z#72057594037927935,21
+ points: a#0,1-c#0,1
+ ranges: w#0,21-z#72057594037927935,21
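
A note on the bounds rule this patch applies in both ingestLoad1 and the new WriterMetadata.Smallest/Largest helpers: point and range key bounds are tracked independently, and the table-wide bound is whichever of the two compares smaller (or larger), with an absent class skipped entirely. The following is a minimal, self-contained Go sketch of that rule; the InternalKey struct, less, and combinedBounds names here are simplified stand-ins for illustration, not pebble's actual API.

package main

import (
	"bytes"
	"fmt"
)

// InternalKey is a simplified stand-in for pebble's internal key: a user key
// plus a trailer encoding (seqnum, kind). A nil UserKey means "unset".
type InternalKey struct {
	UserKey []byte
	Trailer uint64
}

// less orders keys by user key; on a tie, the higher trailer (newer seqnum)
// sorts first, mirroring internal key ordering.
func less(a, b InternalKey) bool {
	if c := bytes.Compare(a.UserKey, b.UserKey); c != 0 {
		return c < 0
	}
	return a.Trailer > b.Trailer
}

// combinedBounds mirrors the patch's rule: if only one class of key is
// present, its bounds are the table bounds; if both are present, take the
// smaller of the two smallest keys and the larger of the two largest keys.
func combinedBounds(smallPoint, largePoint, smallRange, largeRange InternalKey) (smallest, largest InternalKey) {
	hasPoints := smallPoint.UserKey != nil
	hasRanges := smallRange.UserKey != nil
	switch {
	case !hasRanges:
		return smallPoint, largePoint
	case !hasPoints:
		return smallRange, largeRange
	}
	smallest, largest = smallRange, largeRange
	if less(smallPoint, smallRange) {
		smallest = smallPoint
	}
	if less(largeRange, largePoint) {
		largest = largePoint
	}
	return smallest, largest
}

func main() {
	// Point keys span [a, c]; range keys span [w, z) - matching the last
	// testdata case above. The combined bounds are then [a, z).
	s, l := combinedBounds(
		InternalKey{UserKey: []byte("a")}, InternalKey{UserKey: []byte("c")},
		InternalKey{UserKey: []byte("w")}, InternalKey{UserKey: []byte("z")},
	)
	fmt.Printf("%s-%s\n", s.UserKey, l.UserKey) // prints: a-z
}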
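The #72057594037927935 component in the expected test output is the maximal sequence number (2^56 - 1). Exclusive end boundaries (the RANGEDEL sentinel, and now the range key boundary produced by MakeRangeKeySentinelKey) carry that seqnum so they sort ahead of any real key at the boundary user key, and so ingestUpdateSeqNum can recognize them and leave them untouched instead of rewriting their seqnum and silently widening the table. A small sketch of that trailer arithmetic, assuming the usual seqnum<<8|kind packing implied by the testdata; the constant names are stand-ins for the real InternalKeyRangeDeleteSentinel and InternalKeyBoundaryRangeKey constants.

package main

import "fmt"

// Simplified stand-ins: the trailer packs a 56-bit sequence number above an
// 8-bit key kind.
const (
	kindRangeDelete = 15
	kindRangeKeySet = 21
	seqNumMax       = uint64(1)<<56 - 1
)

func makeTrailer(seqNum uint64, kind uint8) uint64 {
	return seqNum<<8 | uint64(kind)
}

func main() {
	// An exclusive end boundary is given the maximal seqnum so that, at the
	// boundary user key, it sorts ahead of every real key, and so seqnum
	// rewriting can detect it by its trailer and skip it.
	rangeDelSentinel := makeTrailer(seqNumMax, kindRangeDelete)
	rangeKeyBoundary := makeTrailer(seqNumMax, kindRangeKeySet)

	fmt.Println(seqNumMax)        // 72057594037927935, as in the testdata output
	fmt.Println(rangeDelSentinel) // trailer for b#72057594037927935,15
	fmt.Println(rangeKeyBoundary) // trailer for z#72057594037927935,21
}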