From 71ca22dcfdd8b1fb8ebfd62315dab2e2c7c23806 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Wed, 14 Jun 2023 13:15:15 +0000
Subject: [PATCH] kvserver: add metric for Raft leader removals

This patch adds the metric `range.raftleaderremovals` which counts the
number of times a Raft leader was removed from a range via a config
change. Other removals, such as range merges, are excluded.

Epic: none
Release note: None
---
 pkg/kv/kvserver/metrics.go | 8 ++++++++
 pkg/kv/kvserver/replica_application_result.go | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 2d083919049b..e0a323a80816 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -1044,6 +1044,12 @@ var (
 		Measurement: "Leader Transfers",
 		Unit: metric.Unit_COUNT,
 	}
+	metaRangeRaftLeaderRemovals = metric.Metadata{
+		Name: "range.raftleaderremovals",
+		Help: "Number of times the current Raft leader was removed from a range",
+		Measurement: "Raft leader removals",
+		Unit: metric.Unit_COUNT,
+	}
 	metaRangeLossOfQuorumRecoveries = metric.Metadata{
 		Name: "range.recoveries",
 		Help: `Count of offline loss of quorum recovery operations performed on ranges.
@@ -2199,6 +2205,7 @@ type StoreMetrics struct {
 	RangeAdds *metric.Counter
 	RangeRemoves *metric.Counter
 	RangeRaftLeaderTransfers *metric.Counter
+	RangeRaftLeaderRemovals *metric.Counter
 	RangeLossOfQuorumRecoveries *metric.Counter
 
 	// Range snapshot metrics.
@@ -2855,6 +2862,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		RangeSnapshotSendQueueSize: metric.NewGauge(metaRangeSnapshotSendQueueSize),
 		RangeSnapshotRecvQueueSize: metric.NewGauge(metaRangeSnapshotRecvQueueSize),
 		RangeRaftLeaderTransfers: metric.NewCounter(metaRangeRaftLeaderTransfers),
+		RangeRaftLeaderRemovals: metric.NewCounter(metaRangeRaftLeaderRemovals),
 		RangeLossOfQuorumRecoveries: metric.NewCounter(metaRangeLossOfQuorumRecoveries),
 		DelegateSnapshotSendBytes: metric.NewCounter(metaDelegateSnapshotSendBytes),
 		DelegateSnapshotSuccesses: metric.NewCounter(metaDelegateSnapshotSuccesses),
diff --git a/pkg/kv/kvserver/replica_application_result.go b/pkg/kv/kvserver/replica_application_result.go
index 769e85bc6fde..14c8db463397 100644
--- a/pkg/kv/kvserver/replica_application_result.go
+++ b/pkg/kv/kvserver/replica_application_result.go
@@ -23,6 +23,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/stop"
 	"github.com/cockroachdb/errors"
+	"go.etcd.io/raft/v3"
 )
 
 // replica_application_*.go files provide concrete implementations of
@@ -422,6 +423,12 @@ func (r *Replica) handleChangeReplicasResult(
 		log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng)
 	}
 
+	// This is currently executed before the conf change is applied to the Raft
+	// node, so we still see ourselves as the leader.
+	if r.raftBasicStatusRLocked().RaftState == raft.StateLeader {
+		r.store.metrics.RangeRaftLeaderRemovals.Inc(1)
+	}
+
 	if _, err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.NextReplicaID(), RemoveOptions{
 		// We destroyed the data when the batch committed so don't destroy it again.
 		DestroyData: false,