Skip to content

Commit

Permalink
server,ui: Add debugging for quiesced ranges
Browse files Browse the repository at this point in the history
I suspect that cockroachdb#26257 is caused by the unquiescedReplicas map
introduced in cockroachdb#24956 getting out of sync with the per-replica
quiescent flag. Add debug pages to help us see if that's happening.

Release note: None
  • Loading branch information
bdarnell committed May 31, 2018
1 parent 31141ce commit d807a4e
Show file tree
Hide file tree
Showing 9 changed files with 485 additions and 274 deletions.
5 changes: 5 additions & 0 deletions pkg/server/problem_ranges.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,17 @@ func (s *statusServer) ProblemRanges(
problems.NoLeaseRangeIDs =
append(problems.NoLeaseRangeIDs, info.State.Desc.RangeID)
}
if info.Problems.QuiescentEqualsTicking {
problems.QuiescentEqualsTickingRangeIDs =
append(problems.QuiescentEqualsTickingRangeIDs, info.State.Desc.RangeID)
}
}
sort.Sort(roachpb.RangeIDSlice(problems.UnavailableRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLeaderNotLeaseHolderRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.NoRaftLeaderRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.NoLeaseRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.UnderreplicatedRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.QuiescentEqualsTickingRangeIDs))
response.ProblemsByNodeID[resp.nodeID] = problems
case <-ctx.Done():
return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error())
Expand Down
696 changes: 430 additions & 266 deletions pkg/server/serverpb/status.pb.go

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ message RangeProblems {
bool no_raft_leader = 3;
bool underreplicated = 4;
bool no_lease = 5;

// Quiescent ranges do not tick by definition, but we track this in
// two different ways and suspect that they're getting out of sync.
// The replica's quiescent flag and the store's ticking state for that
// replica should always be opposites; if they are equal, the two have
// diverged, and this flag is set to warn about it.
bool quiescent_equals_ticking = 6;
}

message RangeStatistics {
Expand Down Expand Up @@ -179,6 +185,7 @@ message RangeInfo {
CommandQueueMetrics cmd_q_global = 12 [ (gogoproto.nullable) = false ];
storage.LeaseStatus lease_status = 13 [ (gogoproto.nullable) = false ];
bool quiescent = 14;
bool ticking = 15;
}

message RangesRequest {
Expand Down Expand Up @@ -505,6 +512,11 @@ message ProblemRangesResponse {
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
repeated int64 quiescent_equals_ticking_range_ids = 7 [
(gogoproto.customname) = "QuiescentEqualsTickingRangeIDs",
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
}
reserved 1 to 7;
// NodeID is the node that submitted all the requests.
Expand Down
12 changes: 7 additions & 5 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -1094,16 +1094,18 @@ func (s *statusServer) Ranges(
WritesPerSecond: rep.WritesPerSecond(),
},
Problems: serverpb.RangeProblems{
Unavailable: metrics.Unavailable,
LeaderNotLeaseHolder: metrics.Leader && metrics.LeaseValid && !metrics.Leaseholder,
NoRaftLeader: !storage.HasRaftLeader(raftStatus) && !metrics.Quiescent,
Underreplicated: metrics.Underreplicated,
NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent,
Unavailable: metrics.Unavailable,
LeaderNotLeaseHolder: metrics.Leader && metrics.LeaseValid && !metrics.Leaseholder,
NoRaftLeader: !storage.HasRaftLeader(raftStatus) && !metrics.Quiescent,
Underreplicated: metrics.Underreplicated,
NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent,
QuiescentEqualsTicking: metrics.Quiescent == metrics.Ticking,
},
CmdQLocal: serverpb.CommandQueueMetrics(metrics.CmdQMetricsLocal),
CmdQGlobal: serverpb.CommandQueueMetrics(metrics.CmdQMetricsGlobal),
LeaseStatus: metrics.LeaseStatus,
Quiescent: metrics.Quiescent,
Ticking: metrics.Ticking,
}
}

Expand Down
15 changes: 14 additions & 1 deletion pkg/storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -6021,7 +6021,13 @@ type ReplicaMetrics struct {
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus LeaseStatus
Quiescent bool

// Quiescent indicates whether the replica believes itself to be quiesced.
Quiescent bool
// Ticking indicates whether the store is ticking the replica. It should be
// the opposite of Quiescent.
Ticking bool

// Is this the replica which collects per-range metrics? This is done either
// on the leader or, if there is no leader, on the largest live replica ID.
RangeCounter bool
Expand Down Expand Up @@ -6050,6 +6056,10 @@ func (r *Replica) Metrics(
r.cmdQMu.Unlock()
r.mu.RUnlock()

r.store.unquiescedReplicas.Lock()
_, ticking := r.store.unquiescedReplicas.m[r.RangeID]
r.store.unquiescedReplicas.Unlock()

return calcReplicaMetrics(
ctx,
now,
Expand All @@ -6060,6 +6070,7 @@ func (r *Replica) Metrics(
leaseStatus,
r.store.StoreID(),
quiescent,
ticking,
cmdQMetricsLocal,
cmdQMetricsGlobal,
)
Expand All @@ -6084,6 +6095,7 @@ func calcReplicaMetrics(
leaseStatus LeaseStatus,
storeID roachpb.StoreID,
quiescent bool,
ticking bool,
cmdQMetricsLocal CommandQueueMetrics,
cmdQMetricsGlobal CommandQueueMetrics,
) ReplicaMetrics {
Expand All @@ -6099,6 +6111,7 @@ func calcReplicaMetrics(
m.Leaseholder = m.LeaseValid && leaseOwner
m.Leader = isRaftLeader(raftStatus)
m.Quiescent = quiescent
m.Ticking = ticking

// We compute an estimated range count across the cluster by counting the
// first live replica in each descriptor. Note that the first live replica is
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8672,7 +8672,7 @@ func TestReplicaMetrics(t *testing.T) {
metrics := calcReplicaMetrics(
context.Background(), hlc.Timestamp{}, config.SystemConfig{},
c.liveness, &c.desc, c.raftStatus, LeaseStatus{},
c.storeID, c.expected.Quiescent, CommandQueueMetrics{}, CommandQueueMetrics{})
c.storeID, c.expected.Quiescent, !c.expected.Quiescent, CommandQueueMetrics{}, CommandQueueMetrics{})
if c.expected != metrics {
t.Fatalf("unexpected metrics:\n%s", pretty.Diff(c.expected, metrics))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,19 @@ const connectionTableColumns: ConnectionTableColumn[] = [
title: "Underreplicated (or slow)",
extract: (problem) => problem.underreplicated_range_ids.length,
},
{
title: "Quiescent equals ticking",
extract: (problem) => problem.quiescent_equals_ticking_range_ids.length,
},
{
title: "Total",
extract: (problem) => {
return problem.unavailable_range_ids.length +
problem.no_raft_leader_range_ids.length +
problem.no_lease_range_ids.length +
problem.raft_leader_not_lease_holder_range_ids.length +
problem.underreplicated_range_ids.length;
problem.underreplicated_range_ids.length +
problem.quiescent_equals_ticking_range_ids.length;
},
},
{ title: "Error", extract: (problem) => problem.error_message },
Expand Down
5 changes: 5 additions & 0 deletions pkg/ui/src/views/reports/containers/problemRanges/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ class ProblemRanges extends React.Component<ProblemRangesProps, {}> {
problems={problems}
extract={(problem) => problem.underreplicated_range_ids}
/>
<ProblemRangeList
name="Quiescent equals ticking"
problems={problems}
extract={(problem) => problem.quiescent_equals_ticking_range_ids}
/>
</div>
);
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/ui/src/views/reports/containers/range/rangeTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const rangeTableDisplayList: RangeTableRow[] = [
{ variable: "problems", display: "Problems", compareToLeader: true },
{ variable: "raftState", display: "Raft State", compareToLeader: false },
{ variable: "quiescent", display: "Quiescent", compareToLeader: true },
{ variable: "ticking", display: "Ticking", compareToLeader: true },
{ variable: "leaseType", display: "Lease Type", compareToLeader: true },
{ variable: "leaseState", display: "Lease State", compareToLeader: true },
{ variable: "leaseHolder", display: "Lease Holder", compareToLeader: true },
Expand Down Expand Up @@ -204,6 +205,9 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
if (problems.unavailable) {
results = _.concat(results, "Unavailable");
}
if (problems.quiescent_equals_ticking) {
results = _.concat(results, "Quiescent equals ticking");
}
if (awaitingGC) {
results = _.concat(results, "Awaiting GC");
}
Expand Down Expand Up @@ -449,6 +453,7 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
problems: this.contentProblems(info.problems, awaitingGC),
raftState: raftState,
quiescent: info.quiescent ? rangeTableQuiescent : rangeTableEmptyContent,
ticking: this.createContent(info.ticking.toString()),
leaseState: leaseState,
leaseHolder: this.createContent(
Print.ReplicaID(rangeID, lease.replica),
Expand Down

0 comments on commit d807a4e

Please sign in to comment.