ui: display very large ranges in problematic ranges
This commit adds a column to the DB Console's problematic ranges page
that displays ranges that are too large. The threshold is set to
8 * the current range max size.

Fixes: #127843

Epic: None

Release note: None
iskettaneh committed Aug 15, 2024
1 parent c80a197 commit cb0219e
Showing 9 changed files with 93 additions and 25 deletions.
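At its core, the change compares a range's total size against a multiple of its split size and surfaces the result as a new RangeTooLarge problem flag. A minimal standalone sketch of that check, with simplified types and hypothetical byte values standing in for real replica state:

```go
package main

import "fmt"

// rangeTooLargeMultiple mirrors the constant introduced in
// replica_metrics.go: ranges beyond 8x the split size are flagged.
const rangeTooLargeMultiple = 8

// splitSize mirrors splitSizeRLocked: the configured max range size, unless a
// recently changed config left behind a larger previous max, in which case
// that larger value is used so ranges aren't flagged right after the limit
// drops.
func splitSize(rangeMaxBytes, largestPreviousMaxRangeSizeBytes int64) int64 {
	if largestPreviousMaxRangeSizeBytes > rangeMaxBytes {
		return largestPreviousMaxRangeSizeBytes
	}
	return rangeMaxBytes
}

func main() {
	const mib = int64(1) << 20
	// Hypothetical values: 512 MiB configured max size, a 5 GiB range.
	rangeMaxBytes := 512 * mib
	rangeSize := 5 * 1024 * mib

	threshold := rangeTooLargeMultiple * splitSize(rangeMaxBytes, 0)
	tooLarge := rangeSize > threshold
	fmt.Printf("threshold = %d MiB, tooLarge = %t\n", threshold/mib, tooLarge)
	// Output: threshold = 4096 MiB, tooLarge = true
}
```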
4 changes: 4 additions & 0 deletions docs/generated/http/full.md
@@ -1326,6 +1326,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| raft_log_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
| range_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |



@@ -1575,6 +1576,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
| range_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |



@@ -3408,6 +3410,7 @@ Support status: [reserved](#support-status)
| raft_log_too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| circuit_breaker_error_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| paused_replica_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |



@@ -3956,6 +3959,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |
| range_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |



39 changes: 29 additions & 10 deletions pkg/kv/kvserver/replica_metrics.go
@@ -52,6 +52,7 @@ type ReplicaMetrics struct {
Underreplicated bool
Overreplicated bool
RaftLogTooLarge bool
RangeTooLarge bool
BehindCount int64
PausedFollowerCount int64
PendingRaftProposalCount int64
@@ -109,6 +110,8 @@ func (r *Replica) Metrics(
lockTableMetrics: lockTableMetrics,
raftLogSize: r.mu.raftLogSize,
raftLogSizeTrusted: r.mu.raftLogSizeTrusted,
rangeSize: r.mu.state.Stats.Total(),
rangeSplitSize: r.splitSizeRLocked(),
qpUsed: qpUsed,
qpCapacity: qpCap,
paused: r.mu.pausedFollowers,
@@ -138,6 +141,8 @@ type calcReplicaMetricsInput struct {
lockTableMetrics concurrency.LockTableMetrics
raftLogSize int64
raftLogSizeTrusted bool
rangeSize int64
rangeSplitSize int64
qpUsed, qpCapacity int64 // quota pool used and capacity bytes
paused map[roachpb.ReplicaID]struct{}
pendingRaftProposalCount int64
@@ -164,8 +169,14 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
}
}

rangeCounter, unavailable, underreplicated, overreplicated := calcRangeCounter(
d.storeID, d.desc, d.leaseStatus, d.vitalityMap, d.conf.GetNumVoters(), d.conf.NumReplicas, d.clusterNodes)
const (
raftLogTooLargeMultiple = 4
rangeTooLargeMultiple = 8
)
largeRangeThreshold := rangeTooLargeMultiple * d.rangeSplitSize
rangeCounter, unavailable, underreplicated, overreplicated, tooLarge := calcRangeCounter(
d.storeID, d.desc, d.leaseStatus, d.vitalityMap, d.conf.GetNumVoters(), d.conf.NumReplicas,
d.clusterNodes, largeRangeThreshold, d.rangeSize)

// The raft leader computes the number of raft entries that replicas are
// behind.
@@ -176,7 +187,6 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
leaderPausedFollowerCount = int64(len(d.paused))
}

const raftLogTooLargeMultiple = 4
return ReplicaMetrics{
Leader: leader,
LeaseValid: validLease,
@@ -194,6 +204,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics {
Overreplicated: overreplicated,
RaftLogTooLarge: d.raftLogSizeTrusted &&
d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold,
RangeTooLarge: tooLarge,
BehindCount: leaderBehindCount,
PausedFollowerCount: leaderPausedFollowerCount,
PendingRaftProposalCount: d.pendingRaftProposalCount,
@@ -217,9 +228,10 @@ func calcQuotaPoolPercentUsed(qpUsed, qpCapacity int64) int64 {

// calcRangeCounter returns whether this replica is designated as the replica in
// the range responsible for range-level metrics, whether the range doesn't have
// a quorum of live voting replicas, and whether the range is currently
// a quorum of live voting replicas, whether the range is currently
// under-replicated (with regards to either the number of voting replicas or the
// number of non-voting replicas).
// number of non-voting replicas), and whether the range is considered too
// large.
//
// Note: we compute an estimated range count across the cluster by counting the
// leaseholder of each descriptor if it's live, otherwise the first live
@@ -232,7 +244,8 @@ func calcRangeCounter(
vitalityMap livenesspb.NodeVitalityMap,
numVoters, numReplicas int32,
clusterNodes int,
) (rangeCounter, unavailable, underreplicated, overreplicated bool) {
rangeTooLargeThreshold, rangeSize int64,
) (rangeCounter, unavailable, underreplicated, overreplicated, tooLarge bool) {
// If there is a live leaseholder (regardless of whether the lease is still
// valid) that leaseholder is responsible for range-level metrics.
if vitalityMap[leaseStatus.Lease.Replica.NodeID].IsLive(livenesspb.Metrics) {
@@ -267,6 +280,7 @@ func calcRangeCounter(
} else if neededVoters < liveVoters || neededNonVoters < liveNonVoters {
overreplicated = true
}
tooLarge = rangeSize > rangeTooLargeThreshold
}
return
}
@@ -351,17 +365,22 @@ func (r *Replica) needsRaftLogTruncationLocked() bool {
return checkRaftLog
}

func (r *Replica) splitSizeRLocked() (maxBytes int64) {
maxBytes = r.mu.conf.RangeMaxBytes
if r.mu.largestPreviousMaxRangeSizeBytes > maxBytes {
maxBytes = r.mu.largestPreviousMaxRangeSizeBytes
}
return maxBytes
}

// exceedsMultipleOfSplitSizeRLocked returns whether the current size of the
// range exceeds the max size times mult. If so, the bytes overage is also
// returned. Note that the max size is determined by either the current maximum
// size as dictated by the span config or a previous max size indicating that
// the max size has changed relatively recently and thus we should not
// backpressure for being over.
func (r *Replica) exceedsMultipleOfSplitSizeRLocked(mult float64) (exceeded bool, bytesOver int64) {
maxBytes := r.mu.conf.RangeMaxBytes
if r.mu.largestPreviousMaxRangeSizeBytes > maxBytes {
maxBytes = r.mu.largestPreviousMaxRangeSizeBytes
}
maxBytes := r.splitSizeRLocked()
size := r.mu.state.Stats.Total()
maxSize := int64(float64(maxBytes)*mult) + 1
if maxBytes <= 0 || size <= maxSize {
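The refactor above extracts splitSizeRLocked so the backpressure check and the new metric share one definition of the split size. For the overage arithmetic in exceedsMultipleOfSplitSizeRLocked, here is a hedged sketch with the replica state passed in explicitly rather than read under lock (the bytesOver computation past the guard is not shown in the diff and is assumed here):

```go
// exceedsMultipleOfSplitSize reports whether size exceeds maxBytes*mult and,
// if so, by how many bytes. A sketch of exceedsMultipleOfSplitSizeRLocked;
// the real method derives maxBytes and size from locked replica state.
func exceedsMultipleOfSplitSize(maxBytes, size int64, mult float64) (exceeded bool, bytesOver int64) {
	maxSize := int64(float64(maxBytes)*mult) + 1
	if maxBytes <= 0 || size <= maxSize {
		return false, 0
	}
	return true, size - maxSize
}
```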
48 changes: 34 additions & 14 deletions pkg/kv/kvserver/replica_metrics_test.go
@@ -55,9 +55,9 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
}))

{
ctr, down, under, over := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
1000: livenesspb.FakeNodeVitality(true), // by NodeID
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

require.True(t, ctr)
require.True(t, down)
@@ -66,9 +66,9 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
}

{
ctr, down, under, over := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
1000: livenesspb.FakeNodeVitality(false),
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

// Does not confuse a non-live entry for a live one. In other words,
// does not think that the liveness map has only entries for live nodes.
@@ -79,12 +79,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
}

{
ctr, down, under, over := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
10: livenesspb.FakeNodeVitality(true),
100: livenesspb.FakeNodeVitality(true),
1000: livenesspb.FakeNodeVitality(true),
2000: livenesspb.FakeNodeVitality(true),
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

require.True(t, ctr)
require.False(t, down)
@@ -94,12 +94,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {

{
// Single non-voter dead
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
10: livenesspb.FakeNodeVitality(true),
100: livenesspb.FakeNodeVitality(true),
1000: livenesspb.FakeNodeVitality(false),
2000: livenesspb.FakeNodeVitality(true),
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

require.True(t, ctr)
require.False(t, down)
@@ -109,12 +109,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {

{
// All non-voters are dead, but range is not unavailable
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
10: livenesspb.FakeNodeVitality(true),
100: livenesspb.FakeNodeVitality(false),
1000: livenesspb.FakeNodeVitality(false),
2000: livenesspb.FakeNodeVitality(false),
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

require.True(t, ctr)
require.False(t, down)
@@ -124,18 +124,38 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {

{
// More non-voters than needed
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{
10: livenesspb.FakeNodeVitality(true),
100: livenesspb.FakeNodeVitality(true),
1000: livenesspb.FakeNodeVitality(true),
2000: livenesspb.FakeNodeVitality(true),
}, 1 /* numVoters */, 3 /* numReplicas */, 4 /* clusterNodes */)
}, 1 /* numVoters */, 3 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)

require.True(t, ctr)
require.False(t, down)
require.False(t, under)
require.True(t, over)
}

{
// Range larger than the threshold.
ctr, _, _, _, large := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
1000: livenesspb.FakeNodeVitality(true), // by NodeID
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /*rangeTooLargeThreshold*/, 2000 /*rangeSize*/)

require.True(t, ctr)
require.True(t, large)
}

{
ctr, _, _, _, large := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{
1000: livenesspb.FakeNodeVitality(false),
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /*rangeTooLargeThreshold*/, 2000 /*rangeSize*/)
require.False(t, ctr)
// Only the node responsible for the range can report if the range is too
// large.
require.False(t, large)
}
}

func TestCalcRangeCounterLeaseHolder(t *testing.T) {
@@ -242,8 +262,8 @@ func TestCalcRangeCounterLeaseHolder(t *testing.T) {
for _, nodeID := range tc.liveNodes {
livenessMap[nodeID] = livenesspb.FakeNodeVitality(true)
}
ctr, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)
ctr, _, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap,
3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /*rangeTooLargeThreshold*/, 0 /*rangeSize*/)
require.Equal(t, tc.expectCounter, ctr)
})
}
5 changes: 5 additions & 0 deletions pkg/server/problem_ranges.go
@@ -141,6 +141,10 @@ func (s *systemStatusServer) ProblemRanges(
problems.PausedReplicaIDs =
append(problems.PausedReplicaIDs, info.State.Desc.RangeID)
}
if info.Problems.RangeTooLarge {
problems.TooLargeRangeIds =
append(problems.TooLargeRangeIds, info.State.Desc.RangeID)
}
}
slices.Sort(problems.UnavailableRangeIDs)
slices.Sort(problems.RaftLeaderNotLeaseHolderRangeIDs)
@@ -152,6 +156,7 @@ func (s *systemStatusServer) ProblemRanges(
slices.Sort(problems.RaftLogTooLargeRangeIDs)
slices.Sort(problems.CircuitBreakerErrorRangeIDs)
slices.Sort(problems.PausedReplicaIDs)
slices.Sort(problems.TooLargeRangeIds)
response.ProblemsByNodeID[resp.nodeID] = problems
case <-ctx.Done():
return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error())
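Server-side, the new problem class follows the same pattern as the existing ones: collect the IDs of ranges whose RangeTooLarge flag is set, then sort them so the response is deterministic. A condensed sketch, with a hypothetical stand-in for the per-range payload the status server iterates over:

```go
package status

import "slices"

// rangeProblemInfo is a hypothetical stand-in for the per-range info
// consumed by ProblemRanges; the real payload carries far more state.
type rangeProblemInfo struct {
	RangeID       int64
	RangeTooLarge bool
}

// collectTooLargeRangeIDs mirrors the new branch: gather flagged range IDs,
// then sort for stable output.
func collectTooLargeRangeIDs(infos []rangeProblemInfo) []int64 {
	var ids []int64
	for _, info := range infos {
		if info.RangeTooLarge {
			ids = append(ids, info.RangeID)
		}
	}
	slices.Sort(ids)
	return ids
}
```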
6 changes: 6 additions & 0 deletions pkg/server/serverpb/status.proto
@@ -409,6 +409,7 @@ message RangeProblems {
bool raft_log_too_large = 7;
bool circuit_breaker_error = 9;
bool paused_followers = 10;
bool range_too_large = 11;
}

// RangeStatistics describes statistics reported by a range. For internal use
@@ -1324,6 +1325,11 @@ message ProblemRangesResponse {
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
repeated int64 too_large_range_ids = 12 [
(gogoproto.customname) = "TooLargeRangeIds",
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
}
reserved 1 to 7;
// NodeID is the node that submitted all the requests.
1 change: 1 addition & 0 deletions pkg/server/status.go
@@ -2495,6 +2495,7 @@ func (s *systemStatusServer) rangesHelper(
NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent,
QuiescentEqualsTicking: raftStatus != nil && metrics.Quiescent == metrics.Ticking,
RaftLogTooLarge: metrics.RaftLogTooLarge,
RangeTooLarge: metrics.RangeTooLarge,
CircuitBreakerError: len(state.CircuitBreakerError) > 0,
PausedFollowers: metrics.PausedFollowerCount > 0,
},
@@ -82,6 +82,10 @@ const connectionTableColumns: ConnectionTableColumn[] = [
title: "Paused Replicas",
extract: problem => problem.paused_replica_ids.length,
},
{
title: "Range Too Large",
extract: problem => problem.too_large_range_ids.length,
},
{
title: "Total",
extract: problem => {
@@ -95,7 +99,8 @@ const connectionTableColumns: ConnectionTableColumn[] = [
problem.quiescent_equals_ticking_range_ids.length +
problem.raft_log_too_large_range_ids.length +
problem.circuit_breaker_error_range_ids.length +
problem.paused_replica_ids.length
problem.paused_replica_ids.length +
problem.too_large_range_ids.length
);
},
},
@@ -230,6 +230,11 @@ export class ProblemRanges extends React.Component<ProblemRangesProps, {}> {
problems={problems}
extract={problem => problem.paused_replica_ids}
/>
<ProblemRangeList
name="Range Too Large"
problems={problems}
extract={problem => problem.too_large_range_ids}
/>
</div>
);
}
@@ -492,6 +492,9 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
if (problems.raft_log_too_large) {
results = concat(results, "Raft log too large");
}
if (problems.range_too_large) {
results = concat(results, "Range too large");
}
if (awaitingGC) {
results = concat(results, "Awaiting GC");
}