Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
86407: ui, server: surface paused replicas in problem ranges, range report, and replication metrics r=Santamaura a=Santamaura

This change surfaces paused replicas in the problem ranges
page, in the range report, and as a new chart in the
replication metrics.

Release justification: low risk, high benefit changes to
existing functionality.

Resolves: cockroachdb#84489

Release note (ui change): surface paused replicas in the range report,
problem ranges, and replication metrics pages.

Co-authored-by: Santamaura <[email protected]>
  • Loading branch information
craig[bot] and Santamaura committed Aug 23, 2022
2 parents 681f951 + 2bd238a commit 8888295
Show file tree
Hide file tree
Showing 10 changed files with 61 additions and 1 deletion.
4 changes: 4 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |



Expand Down Expand Up @@ -1554,6 +1555,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |



Expand Down Expand Up @@ -3366,6 +3368,7 @@ Support status: [reserved](#support-status)
| quiescent_equals_ticking_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| raft_log_too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| circuit_breaker_error_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| paused_replica_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |



Expand Down Expand Up @@ -3758,6 +3761,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |



Expand Down
4 changes: 4 additions & 0 deletions docs/generated/swagger/spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,10 @@
"type": "boolean",
"x-go-name": "Overreplicated"
},
"paused_followers": {
"type": "boolean",
"x-go-name": "PausedFollowers"
},
"quiescent_equals_ticking": {
"description": "Quiescent ranges do not tick by definition, but we track this in\ntwo different ways and suspect that they're getting out of sync.\nIf the replica's quiescent flag doesn't agree with the store's\nlist of replicas that are ticking, warn about it.",
"type": "boolean",
Expand Down
5 changes: 5 additions & 0 deletions pkg/server/problem_ranges.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ func (s *statusServer) ProblemRanges(
problems.CircuitBreakerErrorRangeIDs =
append(problems.CircuitBreakerErrorRangeIDs, info.State.Desc.RangeID)
}
if info.Problems.PausedFollowers {
problems.PausedReplicaIDs =
append(problems.PausedReplicaIDs, info.State.Desc.RangeID)
}
}
sort.Sort(roachpb.RangeIDSlice(problems.UnavailableRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLeaderNotLeaseHolderRangeIDs))
Expand All @@ -143,6 +147,7 @@ func (s *statusServer) ProblemRanges(
sort.Sort(roachpb.RangeIDSlice(problems.QuiescentEqualsTickingRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLogTooLargeRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.CircuitBreakerErrorRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.PausedReplicaIDs))
response.ProblemsByNodeID[resp.nodeID] = problems
case <-ctx.Done():
return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error())
Expand Down
6 changes: 6 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ message RangeProblems {
// When the raft log is too large, it can be a symptom of other issues.
bool raft_log_too_large = 7;
bool circuit_breaker_error = 9;
bool paused_followers = 10;
}

// RangeStatistics describes statistics reported by a range. For internal use
Expand Down Expand Up @@ -1223,6 +1224,11 @@ message ProblemRangesResponse {
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
repeated int64 paused_replica_ids = 11 [
(gogoproto.customname) = "PausedReplicaIDs",
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
}
reserved 1 to 7;
// NodeID is the node that submitted all the requests.
Expand Down
1 change: 1 addition & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -2051,6 +2051,7 @@ func (s *statusServer) rangesHelper(
QuiescentEqualsTicking: raftStatus != nil && metrics.Quiescent == metrics.Ticking,
RaftLogTooLarge: metrics.RaftLogTooLarge,
CircuitBreakerError: len(state.CircuitBreakerError) > 0,
PausedFollowers: metrics.PausedFollowerCount > 0,
},
LeaseStatus: metrics.LeaseStatus,
Quiescent: metrics.Quiescent,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,7 @@ export const CircuitBreakerTrippedEventsTooltip: React.FC = () => (
since the process started.
</div>
);

/**
 * Tooltip content describing the paused-followers metric: renders a single
 * <div> with a one-sentence explanation and takes no props.
 */
export const PausedFollowersTooltip: React.FC = () => {
  return (
    <div>
      The number of nonessential followers that have replication paused.
    </div>
  );
};
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
CircuitBreakerTrippedEventsTooltip,
CircuitBreakerTrippedReplicasTooltip,
LogicalBytesGraphTooltip,
PausedFollowersTooltip,
} from "src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips";
import { cockroach } from "src/js/protos";
import TimeSeriesQueryAggregator = cockroach.ts.tspb.TimeSeriesQueryAggregator;
Expand Down Expand Up @@ -223,5 +224,22 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,
<LineGraph
title="Paused Followers"
sources={storeSources}
tooltip={PausedFollowersTooltip}
>
<Axis label="replicas">
{_.map(nodeIDs, nid => (
<Metric
key={nid}
name="cr.store.admission.raft.paused_replicas"
title={nodeDisplayName(nodesSummary, nid)}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,
];
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ const connectionTableColumns: ConnectionTableColumn[] = [
title: "Circuit breaker error",
extract: problem => problem.circuit_breaker_error_range_ids.length,
},
{
title: "Paused Replicas",
extract: problem => problem.paused_replica_ids.length,
},
{
title: "Total",
extract: problem => {
Expand All @@ -85,7 +89,8 @@ const connectionTableColumns: ConnectionTableColumn[] = [
problem.overreplicated_range_ids.length +
problem.quiescent_equals_ticking_range_ids.length +
problem.raft_log_too_large_range_ids.length +
problem.circuit_breaker_error_range_ids.length
problem.circuit_breaker_error_range_ids.length +
problem.paused_replica_ids.length
);
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ export class ProblemRanges extends React.Component<ProblemRangesProps, {}> {
problems={problems}
extract={problem => problem.circuit_breaker_error_range_ids}
/>
<ProblemRangeList
name="Paused Replicas"
problems={problems}
extract={problem => problem.paused_replica_ids}
/>
</div>
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ const rangeTableDisplayList: RangeTableRow[] = [
display: "Locality Info",
compareToLeader: false,
},
{
variable: "pausedFollowers",
display: "Paused Followers",
compareToLeader: false,
},
];

const rangeTableEmptyContent: RangeTableCellContent = {
Expand Down Expand Up @@ -899,6 +904,9 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
tier => `${tier.key}: ${tier.value}`,
),
})),
pausedFollowers: this.createContent(
info.state.paused_replicas?.join(", "),
),
});
});

Expand Down

0 comments on commit 8888295

Please sign in to comment.