Skip to content

Commit

Permalink
ui, server: surface paused replicas in problem ranges,
Browse files Browse the repository at this point in the history
range report, and replication metrics

This change surfaces paused replicas in the problem ranges
page, in the range report, and as a new chart in the
replication metrics.

Release justification: low risk, high benefit changes to
existing functionality.

Resolves: cockroachdb#84489

Release note (ui change): surface paused replicas in the range report,
problem ranges, and replication metrics pages.
  • Loading branch information
Santamaura committed Aug 19, 2022
1 parent 39a86d3 commit 2bd238a
Show file tree
Hide file tree
Showing 10 changed files with 61 additions and 1 deletion.
4 changes: 4 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RaftDebugResponse-bool) | | | [reserved](#support-status) |



Expand Down Expand Up @@ -1554,6 +1555,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangesResponse-bool) | | | [reserved](#support-status) |



Expand Down Expand Up @@ -3366,6 +3368,7 @@ Support status: [reserved](#support-status)
| quiescent_equals_ticking_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| raft_log_too_large_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| circuit_breaker_error_range_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |
| paused_replica_ids | [int64](#cockroach.server.serverpb.ProblemRangesResponse-int64) | repeated | | [reserved](#support-status) |



Expand Down Expand Up @@ -3758,6 +3761,7 @@ RangeProblems describes issues reported by a range. For internal use only.
| quiescent_equals_ticking | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | Quiescent ranges do not tick by definition, but we track this in two different ways and suspect that they're getting out of sync. If the replica's quiescent flag doesn't agree with the store's list of replicas that are ticking, warn about it. | [reserved](#support-status) |
| raft_log_too_large | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | When the raft log is too large, it can be a symptom of other issues. | [reserved](#support-status) |
| circuit_breaker_error | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |
| paused_followers | [bool](#cockroach.server.serverpb.RangeResponse-bool) | | | [reserved](#support-status) |



Expand Down
4 changes: 4 additions & 0 deletions docs/generated/swagger/spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,10 @@
"type": "boolean",
"x-go-name": "Overreplicated"
},
"paused_followers": {
"type": "boolean",
"x-go-name": "PausedFollowers"
},
"quiescent_equals_ticking": {
"description": "Quiescent ranges do not tick by definition, but we track this in\ntwo different ways and suspect that they're getting out of sync.\nIf the replica's quiescent flag doesn't agree with the store's\nlist of replicas that are ticking, warn about it.",
"type": "boolean",
Expand Down
5 changes: 5 additions & 0 deletions pkg/server/problem_ranges.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ func (s *statusServer) ProblemRanges(
problems.CircuitBreakerErrorRangeIDs =
append(problems.CircuitBreakerErrorRangeIDs, info.State.Desc.RangeID)
}
if info.Problems.PausedFollowers {
problems.PausedReplicaIDs =
append(problems.PausedReplicaIDs, info.State.Desc.RangeID)
}
}
sort.Sort(roachpb.RangeIDSlice(problems.UnavailableRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLeaderNotLeaseHolderRangeIDs))
Expand All @@ -143,6 +147,7 @@ func (s *statusServer) ProblemRanges(
sort.Sort(roachpb.RangeIDSlice(problems.QuiescentEqualsTickingRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLogTooLargeRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.CircuitBreakerErrorRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.PausedReplicaIDs))
response.ProblemsByNodeID[resp.nodeID] = problems
case <-ctx.Done():
return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error())
Expand Down
6 changes: 6 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ message RangeProblems {
// When the raft log is too large, it can be a symptom of other issues.
bool raft_log_too_large = 7;
bool circuit_breaker_error = 9;
bool paused_followers = 10;
}

// RangeStatistics describes statistics reported by a range. For internal use
Expand Down Expand Up @@ -1223,6 +1224,11 @@ message ProblemRangesResponse {
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
repeated int64 paused_replica_ids = 11 [
(gogoproto.customname) = "PausedReplicaIDs",
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
}
reserved 1 to 7;
// NodeID is the node that submitted all the requests.
Expand Down
1 change: 1 addition & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -2045,6 +2045,7 @@ func (s *statusServer) rangesHelper(
QuiescentEqualsTicking: raftStatus != nil && metrics.Quiescent == metrics.Ticking,
RaftLogTooLarge: metrics.RaftLogTooLarge,
CircuitBreakerError: len(state.CircuitBreakerError) > 0,
PausedFollowers: metrics.PausedFollowerCount > 0,
},
LeaseStatus: metrics.LeaseStatus,
Quiescent: metrics.Quiescent,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,7 @@ export const CircuitBreakerTrippedEventsTooltip: React.FC = () => (
since the process started.
</div>
);

/**
 * PausedFollowersTooltip renders the static tooltip text for the
 * "Paused Followers" graph.
 */
export const PausedFollowersTooltip: React.FC = () => {
  return (
    <div>The number of nonessential followers that have replication paused.</div>
  );
};
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
CircuitBreakerTrippedEventsTooltip,
CircuitBreakerTrippedReplicasTooltip,
LogicalBytesGraphTooltip,
PausedFollowersTooltip,
} from "src/views/cluster/containers/nodeGraphs/dashboards/graphTooltips";
import { cockroach } from "src/js/protos";
import TimeSeriesQueryAggregator = cockroach.ts.tspb.TimeSeriesQueryAggregator;
Expand Down Expand Up @@ -223,5 +224,22 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,
<LineGraph
title="Paused Followers"
sources={storeSources}
tooltip={PausedFollowersTooltip}
>
<Axis label="replicas">
{_.map(nodeIDs, nid => (
<Metric
key={nid}
name="cr.store.admission.raft.paused_replicas"
title={nodeDisplayName(nodesSummary, nid)}
sources={storeIDsForNode(nodesSummary, nid)}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,
];
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ const connectionTableColumns: ConnectionTableColumn[] = [
title: "Circuit breaker error",
extract: problem => problem.circuit_breaker_error_range_ids.length,
},
{
title: "Paused Replicas",
extract: problem => problem.paused_replica_ids.length,
},
{
title: "Total",
extract: problem => {
Expand All @@ -85,7 +89,8 @@ const connectionTableColumns: ConnectionTableColumn[] = [
problem.overreplicated_range_ids.length +
problem.quiescent_equals_ticking_range_ids.length +
problem.raft_log_too_large_range_ids.length +
problem.circuit_breaker_error_range_ids.length
problem.circuit_breaker_error_range_ids.length +
problem.paused_replica_ids.length
);
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ export class ProblemRanges extends React.Component<ProblemRangesProps, {}> {
problems={problems}
extract={problem => problem.circuit_breaker_error_range_ids}
/>
<ProblemRangeList
name="Paused Replicas"
problems={problems}
extract={problem => problem.paused_replica_ids}
/>
</div>
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ const rangeTableDisplayList: RangeTableRow[] = [
display: "Locality Info",
compareToLeader: false,
},
{
variable: "pausedFollowers",
display: "Paused Followers",
compareToLeader: false,
},
];

const rangeTableEmptyContent: RangeTableCellContent = {
Expand Down Expand Up @@ -899,6 +904,9 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
tier => `${tier.key}: ${tier.value}`,
),
})),
pausedFollowers: this.createContent(
info.state.paused_replicas?.join(", "),
),
});
});

Expand Down

0 comments on commit 2bd238a

Please sign in to comment.