From b0e1aaa039f0d516f666d3bc4be5a33a5adb04ce Mon Sep 17 00:00:00 2001
From: Ibrahim Kettaneh
Date: Wed, 14 Aug 2024 15:43:53 -0400
Subject: [PATCH] ui: display very large ranges in problematic ranges

This commit adds a column to the DB Console's problematic ranges page
that displays ranges that are too large. The threshold is twice the
current range max size (2 * RangeMaxBytes).

Fixes: #127843

Epic: None

Release note (ui change): Added too-large ranges to the problematic
ranges page in the DB Console. If a range is larger than twice the max
range size, it will show up on the problematic ranges page.
---
 docs/generated/http/full.md                   |  4 ++
 pkg/kv/kvserver/replica_metrics.go            | 24 +++++++---
 pkg/kv/kvserver/replica_metrics_test.go       | 48 +++++++++++++------
 pkg/server/problem_ranges.go                  |  5 ++
 pkg/server/serverpb/status.proto              |  6 +++
 pkg/server/status.go                          |  1 +
 .../problemRanges/connectionsTable.tsx        |  7 ++-
 .../containers/problemRanges/index.tsx        |  5 ++
 .../reports/containers/range/rangeTable.tsx   |  3 ++
 9 files changed, 82 insertions(+), 21 deletions(-)

diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md
index f066f4641834..338d244cbedd 100644
--- a/docs/generated/http/full.md
+++ b/docs/generated/http/full.md
@@ -1326,6 +1326,7 @@ RangeProblems describes issues reported by a range. For internal use only.
 | raft_log_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
 | circuit_breaker_error | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
 | paused_followers | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
+| range_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |



@@ -1575,6 +1576,7 @@ RangeProblems describes issues reported by a range. For internal use only.
 | raft_log_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
 | circuit_breaker_error | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
 | paused_followers | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
+| range_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |



@@ -3408,6 +3410,7 @@ Support status: [reserved](#support-status)
 | raft_log_too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
 | circuit_breaker_error_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
 | paused_replica_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
+| too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |



@@ -3956,6 +3959,7 @@ RangeProblems describes issues reported by a range. For internal use only.
 | raft_log_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. 
| [reserved](#support-status) | | circuit_breaker_error | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) | | paused_followers | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) | +| range_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) | diff --git a/pkg/kv/kvserver/replica_metrics.go b/pkg/kv/kvserver/replica_metrics.go index 6b785da3cd18..b6aac79f1a7e 100644 --- a/pkg/kv/kvserver/replica_metrics.go +++ b/pkg/kv/kvserver/replica_metrics.go @@ -52,6 +52,7 @@ type ReplicaMetrics struct { Underreplicated bool Overreplicated bool RaftLogTooLarge bool + RangeTooLarge bool BehindCount int64 PausedFollowerCount int64 PendingRaftProposalCount int64 @@ -109,6 +110,7 @@ func (r *Replica) Metrics( lockTableMetrics: lockTableMetrics, raftLogSize: r.mu.raftLogSize, raftLogSizeTrusted: r.mu.raftLogSizeTrusted, + rangeSize: r.mu.state.Stats.Total(), qpUsed: qpUsed, qpCapacity: qpCap, paused: r.mu.pausedFollowers, @@ -138,6 +140,7 @@ type calcReplicaMetricsInput struct { lockTableMetrics concurrency.LockTableMetrics raftLogSize int64 raftLogSizeTrusted bool + rangeSize int64 qpUsed, qpCapacity int64 // quota pool used and capacity bytes paused map[roachpb.ReplicaID]struct{} pendingRaftProposalCount int64 @@ -164,8 +167,14 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics { } } - rangeCounter, unavailable, underreplicated, overreplicated := calcRangeCounter( - d.storeID, d.desc, d.leaseStatus, d.vitalityMap, d.conf.GetNumVoters(), d.conf.NumReplicas, d.clusterNodes) + const ( + raftLogTooLargeMultiple = 4 + rangeTooLargeMultiple = 2 + ) + largeRangeThreshold := rangeTooLargeMultiple * d.conf.RangeMaxBytes + rangeCounter, unavailable, underreplicated, overreplicated, tooLarge := calcRangeCounter( + d.storeID, d.desc, d.leaseStatus, d.vitalityMap, d.conf.GetNumVoters(), d.conf.NumReplicas, + d.clusterNodes, largeRangeThreshold, d.rangeSize) // The raft leader computes the number of raft entries that replicas are // behind. @@ -176,7 +185,6 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics { leaderPausedFollowerCount = int64(len(d.paused)) } - const raftLogTooLargeMultiple = 4 return ReplicaMetrics{ Leader: leader, LeaseValid: validLease, @@ -194,6 +202,7 @@ func calcReplicaMetrics(d calcReplicaMetricsInput) ReplicaMetrics { Overreplicated: overreplicated, RaftLogTooLarge: d.raftLogSizeTrusted && d.raftLogSize > raftLogTooLargeMultiple*d.raftCfg.RaftLogTruncationThreshold, + RangeTooLarge: tooLarge, BehindCount: leaderBehindCount, PausedFollowerCount: leaderPausedFollowerCount, PendingRaftProposalCount: d.pendingRaftProposalCount, @@ -217,9 +226,10 @@ func calcQuotaPoolPercentUsed(qpUsed, qpCapacity int64) int64 { // calcRangeCounter returns whether this replica is designated as the replica in // the range responsible for range-level metrics, whether the range doesn't have -// a quorum of live voting replicas, and whether the range is currently +// a quorum of live voting replicas, whether the range is currently // under-replicated (with regards to either the number of voting replicas or the -// number of non-voting replicas). +// number of non-voting replicas), and whether the range is considered too +// large. 
// // Note: we compute an estimated range count across the cluster by counting the // leaseholder of each descriptor if it's live, otherwise the first live @@ -232,7 +242,8 @@ func calcRangeCounter( vitalityMap livenesspb.NodeVitalityMap, numVoters, numReplicas int32, clusterNodes int, -) (rangeCounter, unavailable, underreplicated, overreplicated bool) { + rangeTooLargeThreshold, rangeSize int64, +) (rangeCounter, unavailable, underreplicated, overreplicated, tooLarge bool) { // If there is a live leaseholder (regardless of whether the lease is still // valid) that leaseholder is responsible for range-level metrics. if vitalityMap[leaseStatus.Lease.Replica.NodeID].IsLive(livenesspb.Metrics) { @@ -267,6 +278,7 @@ func calcRangeCounter( } else if neededVoters < liveVoters || neededNonVoters < liveNonVoters { overreplicated = true } + tooLarge = rangeSize > rangeTooLargeThreshold } return } diff --git a/pkg/kv/kvserver/replica_metrics_test.go b/pkg/kv/kvserver/replica_metrics_test.go index c6f1c0d0b409..a5182f4ac1c6 100644 --- a/pkg/kv/kvserver/replica_metrics_test.go +++ b/pkg/kv/kvserver/replica_metrics_test.go @@ -55,9 +55,9 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { })) { - ctr, down, under, over := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ 1000: livenesspb.FakeNodeVitality(true), // by NodeID - }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.True(t, ctr) require.True(t, down) @@ -66,9 +66,9 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { } { - ctr, down, under, over := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ 1000: livenesspb.FakeNodeVitality(false), - }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) // Does not confuse a non-live entry for a live one. In other words, // does not think that the liveness map has only entries for live nodes. 
@@ -79,12 +79,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { } { - ctr, down, under, over := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(11, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ 10: livenesspb.FakeNodeVitality(true), 100: livenesspb.FakeNodeVitality(true), 1000: livenesspb.FakeNodeVitality(true), 2000: livenesspb.FakeNodeVitality(true), - }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.True(t, ctr) require.False(t, down) @@ -94,12 +94,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { { // Single non-voter dead - ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ 10: livenesspb.FakeNodeVitality(true), 100: livenesspb.FakeNodeVitality(true), 1000: livenesspb.FakeNodeVitality(false), 2000: livenesspb.FakeNodeVitality(true), - }, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + }, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.True(t, ctr) require.False(t, down) @@ -109,12 +109,12 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { { // All non-voters are dead, but range is not unavailable - ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ 10: livenesspb.FakeNodeVitality(true), 100: livenesspb.FakeNodeVitality(false), 1000: livenesspb.FakeNodeVitality(false), 2000: livenesspb.FakeNodeVitality(false), - }, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + }, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.True(t, ctr) require.False(t, down) @@ -124,18 +124,38 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) { { // More non-voters than needed - ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ + ctr, down, under, over, _ := calcRangeCounter(11, oneVoterAndThreeNonVoters, leaseStatus, livenesspb.NodeVitalityMap{ 10: livenesspb.FakeNodeVitality(true), 100: livenesspb.FakeNodeVitality(true), 1000: livenesspb.FakeNodeVitality(true), 2000: livenesspb.FakeNodeVitality(true), - }, 1 /* numVoters */, 3 /* numReplicas */, 4 /* clusterNodes */) + }, 1 /* numVoters */, 3 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.True(t, ctr) require.False(t, down) require.False(t, under) require.True(t, over) } + + { + // Range larger than the threshold. 
+ ctr, _, _, _, large := calcRangeCounter(1100, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ + 1000: livenesspb.FakeNodeVitality(true), // by NodeID + }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /* rangeTooLargeThreshold */, 2000 /* rangeSize */) + + require.True(t, ctr) + require.True(t, large) + } + + { + ctr, _, _, _, large := calcRangeCounter(1000, threeVotersAndSingleNonVoter, leaseStatus, livenesspb.NodeVitalityMap{ + 1000: livenesspb.FakeNodeVitality(false), + }, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 1000 /* rangeTooLargeThreshold */, 2000 /* rangeSize */) + require.False(t, ctr) + // Only the node responsible for the range can report if the range is too + // large. + require.False(t, large) + } } func TestCalcRangeCounterLeaseHolder(t *testing.T) { @@ -242,8 +262,8 @@ func TestCalcRangeCounterLeaseHolder(t *testing.T) { for _, nodeID := range tc.liveNodes { livenessMap[nodeID] = livenesspb.FakeNodeVitality(true) } - ctr, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap, - 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */) + ctr, _, _, _, _ := calcRangeCounter(tc.storeID, rangeDesc, tc.leaseStatus, livenessMap, + 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */, 0 /* rangeTooLargeThreshold */, 0 /* rangeSize */) require.Equal(t, tc.expectCounter, ctr) }) } diff --git a/pkg/server/problem_ranges.go b/pkg/server/problem_ranges.go index e0816963cf73..fce5fac7b897 100644 --- a/pkg/server/problem_ranges.go +++ b/pkg/server/problem_ranges.go @@ -141,6 +141,10 @@ func (s *systemStatusServer) ProblemRanges( problems.PausedReplicaIDs = append(problems.PausedReplicaIDs, info.State.Desc.RangeID) } + if info.Problems.RangeTooLarge { + problems.TooLargeRangeIds = + append(problems.TooLargeRangeIds, info.State.Desc.RangeID) + } } slices.Sort(problems.UnavailableRangeIDs) slices.Sort(problems.RaftLeaderNotLeaseHolderRangeIDs) @@ -152,6 +156,7 @@ func (s *systemStatusServer) ProblemRanges( slices.Sort(problems.RaftLogTooLargeRangeIDs) slices.Sort(problems.CircuitBreakerErrorRangeIDs) slices.Sort(problems.PausedReplicaIDs) + slices.Sort(problems.TooLargeRangeIds) response.ProblemsByNodeID[resp.nodeID] = problems case <-ctx.Done(): return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error()) diff --git a/pkg/server/serverpb/status.proto b/pkg/server/serverpb/status.proto index 55aeede28e55..9f04bb2a5d03 100644 --- a/pkg/server/serverpb/status.proto +++ b/pkg/server/serverpb/status.proto @@ -409,6 +409,7 @@ message RangeProblems { bool raft_log_too_large = 7; bool circuit_breaker_error = 9; bool paused_followers = 10; + bool range_too_large = 11; } // RangeStatistics describes statistics reported by a range. For internal use @@ -1324,6 +1325,11 @@ message ProblemRangesResponse { (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" ]; + repeated int64 too_large_range_ids = 12 [ + (gogoproto.customname) = "TooLargeRangeIds", + (gogoproto.casttype) = + "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" + ]; } reserved 1 to 7; // NodeID is the node that submitted all the requests. 
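The test cases above exercise the check added in calcRangeCounter. The following is a minimal, self-contained Go sketch of that logic, using simplified, hypothetical names in place of the real kvserver types and inputs (it is not the actual CockroachDB code): only the replica designated as the range counter reports the problem, and it does so once the range exceeds rangeTooLargeMultiple (2) times the configured max range size, so each too-large range is reported exactly once cluster-wide.

package main

import "fmt"

// rangeTooLargeMultiple mirrors the constant added in replica_metrics.go:
// a range is flagged once it exceeds 2x the configured max range size.
const rangeTooLargeMultiple = 2

// rangeTooLarge is a simplified stand-in for the new return value of
// calcRangeCounter. isCounterReplica corresponds to the existing
// rangeCounter result: only the replica responsible for range-level
// metrics reports the problem, so a range is not double-counted when the
// per-node responses are aggregated.
func rangeTooLarge(isCounterReplica bool, rangeSize, rangeMaxBytes int64) bool {
	if !isCounterReplica {
		return false
	}
	return rangeSize > rangeTooLargeMultiple*rangeMaxBytes
}

func main() {
	const rangeMaxBytes = 512 << 20 // e.g. a 512 MiB max range size

	fmt.Println(rangeTooLarge(true, 1<<30+1, rangeMaxBytes)) // true: just over the 1 GiB threshold
	fmt.Println(rangeTooLarge(true, 768<<20, rangeMaxBytes)) // false: large, but under 2x the max
	fmt.Println(rangeTooLarge(false, 4<<30, rangeMaxBytes))  // false: not the metrics-reporting replica
}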
diff --git a/pkg/server/status.go b/pkg/server/status.go index b47dcf1cc9d3..401a8ab1dab7 100644 --- a/pkg/server/status.go +++ b/pkg/server/status.go @@ -2495,6 +2495,7 @@ func (s *systemStatusServer) rangesHelper( NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent, QuiescentEqualsTicking: raftStatus != nil && metrics.Quiescent == metrics.Ticking, RaftLogTooLarge: metrics.RaftLogTooLarge, + RangeTooLarge: metrics.RangeTooLarge, CircuitBreakerError: len(state.CircuitBreakerError) > 0, PausedFollowers: metrics.PausedFollowerCount > 0, }, diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx index 542637546e09..6374a5d99b4f 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/connectionsTable.tsx @@ -82,6 +82,10 @@ const connectionTableColumns: ConnectionTableColumn[] = [ title: "Paused Replicas", extract: problem => problem.paused_replica_ids.length, }, + { + title: "Range Too Large", + extract: problem => problem.too_large_range_ids.length, + }, { title: "Total", extract: problem => { @@ -95,7 +99,8 @@ const connectionTableColumns: ConnectionTableColumn[] = [ problem.quiescent_equals_ticking_range_ids.length + problem.raft_log_too_large_range_ids.length + problem.circuit_breaker_error_range_ids.length + - problem.paused_replica_ids.length + problem.paused_replica_ids.length + + problem.too_large_range_ids.length ); }, }, diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx index b56960793b8a..f3325bfc829b 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/problemRanges/index.tsx @@ -230,6 +230,11 @@ export class ProblemRanges extends React.Component { problems={problems} extract={problem => problem.paused_replica_ids} /> + problem.too_large_range_ids} + /> ); } diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx index d2ae58c44014..c0870dca5734 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx @@ -492,6 +492,9 @@ export default class RangeTable extends React.Component { if (problems.raft_log_too_large) { results = concat(results, "Raft log too large"); } + if (problems.range_too_large) { + results = concat(results, "Range too large"); + } if (awaitingGC) { results = concat(results, "Awaiting GC"); }
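End-to-end, status.go copies ReplicaMetrics.RangeTooLarge into RangeProblems.range_too_large, ProblemRanges() gathers the affected range IDs into the new too_large_range_ids field, and the DB Console pages above render the list and include its length in the Total column. The sketch below illustrates just the per-node aggregation step under simplified, hypothetical types (plain int64 IDs instead of the real serverpb/roachpb messages); it is an illustration, not the actual CockroachDB code.

package main

import (
	"fmt"
	"slices"
)

// rangeInfo is a stand-in for the per-range status payload; only the
// fields relevant to this problem category are shown.
type rangeInfo struct {
	rangeID       int64
	rangeTooLarge bool
}

// tooLargeRangeIDs mirrors the branch added to ProblemRanges(): it gathers
// the IDs of ranges flagged as too large and returns them sorted, matching
// the slices.Sort call applied to the other problem categories.
func tooLargeRangeIDs(infos []rangeInfo) []int64 {
	var ids []int64
	for _, info := range infos {
		if info.rangeTooLarge {
			ids = append(ids, info.rangeID)
		}
	}
	slices.Sort(ids)
	return ids
}

func main() {
	infos := []rangeInfo{
		{rangeID: 42, rangeTooLarge: true},
		{rangeID: 7, rangeTooLarge: false},
		{rangeID: 13, rangeTooLarge: true},
	}
	fmt.Println(tooLargeRangeIDs(infos)) // [13 42]
}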