Skip to content

Commit

Permalink
storage: add MVCCStats for range keys
Browse files Browse the repository at this point in the history
This patch adds `MVCCStats` tracking for range keys. For now it only
considers range tombstones, since the semantics of other range keys are
still unclear (e.g. how should overlapping range keys be interpreted?),
and it will error if it encounters a non-tombstone range key.

Two new fields are added to `MVCCStats`:

* `RangeKeyCount`: the number of individual defragmented logical range
  keys, regardless of any overlap with other range keys. Multiple versions
  count as separate range keys, even if they overlap exactly.

* `RangeKeyBytes`: the logical encoded byte size of all range keys,
  excluding value. This ignores fragmentation, and counts the key bounds
  separately for each version even if multiple range keys overlap exactly.
  Unlike point keys, which for historical reasons use a fixed-size
  timestamp contribution, this uses the actual variable-length timestamp
  encoding contribution.

`ComputeStatsForRange()` has been extended to calculate the above
quantities, and additionally account for range tombstones themselves in
`GCBytesAge` along with their effect on point keys. However, these
statistics are not yet updated during range key mutations and GC, nor on
CRDB range splits and merges -- this will be addressed separately. All
relevant call sites have been updated to surface range keys for the MVCC
iterators passed to `ComputeStatsForRange()`.

Release note: None
  • Loading branch information
erikgrinaker committed Feb 22, 2022
1 parent 4f058c2 commit d58c6b4
Show file tree
Hide file tree
Showing 15 changed files with 350 additions and 55 deletions.
5 changes: 4 additions & 1 deletion pkg/kv/kvserver/batcheval/cmd_clear_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,10 @@ func computeStatsDelta(
// If we can't use the fast stats path, or race test is enabled,
// compute stats across the key span to be cleared.
if !fast || util.RaceEnabled {
iter := readWriter.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{UpperBound: to})
iter := readWriter.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{
KeyTypes: storage.IterKeyTypePointsAndRanges,
UpperBound: to,
})
computed, err := iter.ComputeStats(from, to, delta.LastUpdateNanos)
iter.Close()
if err != nil {
Expand Down
5 changes: 4 additions & 1 deletion pkg/kv/kvserver/batcheval/cmd_end_transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -1165,7 +1165,10 @@ func mergeTrigger(
{
ridPrefix := keys.MakeRangeIDReplicatedPrefix(merge.RightDesc.RangeID)
// NB: Range-ID local keys have no versions and no intents.
iter := batch.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{UpperBound: ridPrefix.PrefixEnd()})
iter := batch.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{
KeyTypes: storage.IterKeyTypePointsAndRanges,
UpperBound: ridPrefix.PrefixEnd(),
})
defer iter.Close()
sysMS, err := iter.ComputeStats(ridPrefix, ridPrefix.PrefixEnd(), 0 /* nowNanos */)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/batcheval/cmd_truncate_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ func TruncateLog(
// bugs that let it diverge. It might be easier to compute the stats
// from scratch, stopping when 4mb (defaultRaftLogTruncationThreshold)
// is reached as at that point we'll truncate aggressively anyway.
//
// We do not use IterKeyTypePointsAndRanges since there should be no
// range keys here.
iter := readWriter.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{UpperBound: end})
defer iter.Close()
// We can pass zero as nowNanos because we're only interested in SysBytes.
Expand Down
6 changes: 4 additions & 2 deletions pkg/kv/kvserver/rditer/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ func ComputeStatsForRange(
var err error
for _, keyRange := range MakeReplicatedKeyRangesExceptLockTable(d) {
func() {
iter := reader.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind,
storage.IterOptions{UpperBound: keyRange.End})
iter := reader.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{
KeyTypes: storage.IterKeyTypePointsAndRanges,
UpperBound: keyRange.End,
})
defer iter.Close()

var msDelta enginepb.MVCCStats
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/replica_consistency.go
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,8 @@ func (r *Replica) sha512(
var timestampBuf []byte
hasher := sha512.New()

visitor := func(unsafeKey storage.MVCCKey, unsafeValue []byte) error {
// TODO(erikgrinaker): add a range key visitor to hash range keys.
pointKeyVisitor := func(unsafeKey storage.MVCCKey, unsafeValue []byte) error {
// Rate Limit the scan through the range
if err := limiter.WaitN(ctx, int64(len(unsafeKey.Key)+len(unsafeValue))); err != nil {
return err
Expand Down Expand Up @@ -633,9 +634,8 @@ func (r *Replica) sha512(
for _, span := range rditer.MakeReplicatedKeyRangesExceptLockTable(&desc) {
iter := snap.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind,
storage.IterOptions{UpperBound: span.End})
spanMS, err := storage.ComputeStatsForRange(
iter, span.Start, span.End, 0 /* nowNanos */, visitor,
)
spanMS, err := storage.ComputeStatsForRangeWithVisitors(
iter, span.Start, span.End, 0 /* nowNanos */, pointKeyVisitor, nil /* rangeKeyVisitor */)
iter.Close()
if err != nil {
return nil, err
Expand Down
10 changes: 8 additions & 2 deletions pkg/storage/bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,10 @@ func runMVCCScan(ctx context.Context, b *testing.B, emk engineMaker, opts benchS
// Pull all of the sstables into the RocksDB cache in order to make the
// timings more stable. Otherwise, the first run will be penalized pulling
// data into the cache while later runs will not.
iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax})
iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
UpperBound: roachpb.KeyMax,
})
_, _ = iter.ComputeStats(keys.LocalMax, roachpb.KeyMax, 0)
iter.Close()
}
Expand Down Expand Up @@ -1338,7 +1341,10 @@ func runMVCCComputeStats(ctx context.Context, b *testing.B, emk engineMaker, val
var stats enginepb.MVCCStats
var err error
for i := 0; i < b.N; i++ {
iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: roachpb.KeyMax})
iter := eng.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{
KeyTypes: IterKeyTypePointsAndRanges,
UpperBound: roachpb.KeyMax,
})
stats, err = iter.ComputeStats(keys.LocalMax, roachpb.KeyMax, 0)
iter.Close()
if err != nil {
Expand Down
11 changes: 6 additions & 5 deletions pkg/storage/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,12 @@ type MVCCIterator interface {
ValueProto(msg protoutil.Message) error
// ComputeStats scans the underlying engine from start to end keys and
// computes stats counters based on the values. This method is used after a
// range is split to recompute stats for each subrange. The start key is
// always adjusted to avoid counting local keys in the event stats are being
// recomputed for the first range (i.e. the one with start key == KeyMin).
// The nowNanos arg specifies the wall time in nanoseconds since the
// epoch and is used to compute the total age of all intents.
// range is split to recompute stats for each subrange. The nowNanos arg
// specifies the wall time in nanoseconds since the epoch and is used to
// compute the total age of intents and garbage.
//
// To account for intents and range keys, the iterator must be created with
// MVCCKeyAndIntentsIterKind and IterKeyTypePointsAndRanges.
ComputeStats(start, end roachpb.Key, nowNanos int64) (enginepb.MVCCStats, error)
// FindSplitKey finds a key from the given span such that the left side of
// the split is roughly targetSize bytes. The returned key will never be
Expand Down
14 changes: 10 additions & 4 deletions pkg/storage/enginepb/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,17 @@ func (t TxnMeta) Short() redact.SafeString {
}

// Total returns the range size as the sum of the key and value
// bytes. This includes all non-live keys and all versioned values.
// bytes. This includes all non-live keys and all versioned values,
// both for point and range keys.
func (ms MVCCStats) Total() int64 {
return ms.KeyBytes + ms.ValBytes
return ms.KeyBytes + ms.ValBytes + ms.RangeKeyBytes
}

// GCBytes is a convenience function which returns the number of gc bytes,
// that is the key and value bytes excluding the live bytes.
// that is the key and value bytes excluding the live bytes, both for
// point keys and range keys.
func (ms MVCCStats) GCBytes() int64 {
return ms.KeyBytes + ms.ValBytes - ms.LiveBytes
return ms.KeyBytes + ms.ValBytes + ms.RangeKeyBytes - ms.LiveBytes
}

// AvgIntentAge returns the average age of outstanding intents,
Expand Down Expand Up @@ -169,6 +171,8 @@ func (ms *MVCCStats) Add(oms MVCCStats) {
ms.ValCount += oms.ValCount
ms.IntentCount += oms.IntentCount
ms.SeparatedIntentCount += oms.SeparatedIntentCount
ms.RangeKeyCount += oms.RangeKeyCount
ms.RangeKeyBytes += oms.RangeKeyBytes
ms.SysBytes += oms.SysBytes
ms.SysCount += oms.SysCount
ms.AbortSpanBytes += oms.AbortSpanBytes
Expand Down Expand Up @@ -196,6 +200,8 @@ func (ms *MVCCStats) Subtract(oms MVCCStats) {
ms.ValCount -= oms.ValCount
ms.IntentCount -= oms.IntentCount
ms.SeparatedIntentCount -= oms.SeparatedIntentCount
ms.RangeKeyCount -= oms.RangeKeyCount
ms.RangeKeyBytes -= oms.RangeKeyBytes
ms.SysBytes -= oms.SysBytes
ms.SysCount -= oms.SysCount
ms.AbortSpanBytes -= oms.AbortSpanBytes
Expand Down
25 changes: 22 additions & 3 deletions pkg/storage/enginepb/mvcc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ message MVCCStats {
// intent_age is the cumulative age of the tracked intents.
// See the comment on MVCCStats.
optional sfixed64 intent_age = 2 [(gogoproto.nullable) = false];
// gc_bytes_age is the cumulative age of the non-live data (i.e.
// data included in key_bytes and val_bytes, but not live_bytes).
// gc_bytes_age is the cumulative age of the non-live data (i.e. data
// included in key_bytes, val_bytes, and range_key_bytes, but not live_bytes).
// See the comment on MVCCStats.
optional sfixed64 gc_bytes_age = 3 [(gogoproto.nullable) = false, (gogoproto.customname) = "GCBytesAge"];
// live_bytes is the number of bytes stored in keys and values which can in
Expand All @@ -177,7 +177,7 @@ message MVCCStats {
// live_count is the number of meta keys tracked under live_bytes.
optional sfixed64 live_count = 5 [(gogoproto.nullable) = false];
// key_bytes is the number of bytes stored in all non-system
// keys, including live, meta, old, and deleted keys.
// point keys, including live, meta, old, and deleted keys.
// Only meta keys really account for the "full" key; value
// keys only for the timestamp suffix.
optional sfixed64 key_bytes = 6 [(gogoproto.nullable) = false];
Expand All @@ -201,6 +201,25 @@ message MVCCStats {
// intents, so mixed-version clusters with nodes preceding this knowledge
// will always have a 0 value for this field.
optional sfixed64 separated_intent_count = 16 [(gogoproto.nullable) = false];
// range_key_count is the number of logical range keys tracked under
// range_key_bytes, disregarding fragmentation and overlap. All abutting range
// key fragments at the same timestamp are considered part of the same logical
// range key (assuming they have the same value). However, a range key that
// straddles a range split boundary will become two separate logical range
// keys (one in each range), and merge back to one when the ranges merge.
// Overlapping range keys (at different timestamps) are counted independently,
// because it is unclear how to otherwise handle partial overlap.
//
// MVCC range tombstones are EXPERIMENTAL, and will only be written when
// storage.CanUseExperimentalMVCCRangeTombstones() is enabled.
//
// NB: Currently, all range keys are MVCC range tombstones with no value.
// Therefore, these contribute to neither live_count nor live_bytes, and there
// is no range_val_bytes or range_val_count.
optional sfixed64 range_key_count = 17 [(gogoproto.nullable) = false];
// range_key_bytes is the encoded size of logical range keys, disregarding
// fragmentation and overlap.
optional sfixed64 range_key_bytes = 18 [(gogoproto.nullable) = false];

// sys_bytes is the number of bytes stored in system-local kv-pairs.
// This tracks the same quantity as (key_bytes + val_bytes), but
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/enginepb/mvcc3.proto
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ message MVCCStatsDelta {
sint64 intent_bytes = 10;
sint64 intent_count = 11;
sint64 separated_intent_count = 16;
sint64 range_key_count = 17;
sint64 range_key_bytes = 18;
sint64 sys_bytes = 12;
sint64 sys_count = 13;
sint64 abort_span_bytes = 15;
Expand Down Expand Up @@ -188,6 +190,8 @@ message MVCCPersistentStats {
int64 intent_bytes = 10;
int64 intent_count = 11;
int64 separated_intent_count = 16;
int64 range_key_count = 17;
int64 range_key_bytes = 18;
int64 sys_bytes = 12;
int64 sys_count = 13;
int64 abort_span_bytes = 15;
Expand Down
Loading

0 comments on commit d58c6b4

Please sign in to comment.