From 0e608b06bf24322ccf3c6fb1a201aee96d55c9fe Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 17 May 2023 14:56:38 +0200 Subject: [PATCH] kvserver: don't quiesce with follower in StateProbe shouldReplicaQuiesce checks that all followers are fully caught up, but it's still possible to have a follower in StateProbe (because we call `rawNode.ReportUnreachable` when an outgoing message gets dropped). Persistent StateProbe is problematic: we check for it before lease transfers, so despite the follower being fully caught up, we'd refuse the transfer. Unfortunately, this commit itself is not enough: even if the range is not quiesced, it needs to replicate a new entry to rectify the situation (i.e. switch follower back to StateReplicate). This is because at the time of writing, receipt of a heartbeat response from the follower is not enough to move it back to StateReplicate. This was fixed upstream, in https://github.com/cockroachdb/cockroach/pull/103826. However, this is still not enough! If the range quiesces successfully, and *then* `ReportUnreachable` is called, we still end up in the same state. TODO file issue about this. I ran into the above issue on https://github.com/cockroachdb/cockroach/pull/99191, which adds persistent circuit breakers, when stressing `TestStoreMetrics`. That test happens to restart n2 when it's fully caught up and due to the persistence of the circuit breakers when it comes up the leader will move it into StateProbe (since we can end up dropping the first heartbeat sent to it as it comes up, since the breaker hasn't untripped yet). But, I believe that this bug is real even without this breaker re-work, just harder to trigger. Epic: none Release note (bug fix): fixed a problem that could lead to erroneously refused lease transfers (error message: "refusing to transfer lease to [...] because target may need a Raft snapshot: replica in StateProbe" --- pkg/kv/kvserver/replica_raft_quiesce.go | 7 ++++--- pkg/kv/kvserver/replica_test.go | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pkg/kv/kvserver/replica_raft_quiesce.go b/pkg/kv/kvserver/replica_raft_quiesce.go index 148cd073758e..d8cf55e0a5a8 100644 --- a/pkg/kv/kvserver/replica_raft_quiesce.go +++ b/pkg/kv/kvserver/replica_raft_quiesce.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "go.etcd.io/raft/v3" "go.etcd.io/raft/v3/raftpb" + "go.etcd.io/raft/v3/tracker" ) // quiesceAfterTicks is the number of ticks without proposals after which ranges @@ -420,7 +421,7 @@ func shouldReplicaQuiesce( rep.ReplicaID, progress) } return nil, nil, false - } else if progress.Match != status.Applied { + } else if progress.Match != status.Applied || progress.State != tracker.StateReplicate { // Skip any node in the descriptor which is not live. Instead, add // the node to the set of replicas lagging the quiescence index. if l, ok := livenessMap[rep.NodeID]; ok && !l.IsLive { @@ -432,8 +433,8 @@ func shouldReplicaQuiesce( continue } if log.V(4) { - log.Infof(ctx, "not quiescing: replica %d match (%d) != applied (%d)", - rep.ReplicaID, progress.Match, status.Applied) + log.Infof(ctx, "not quiescing: replica %d match (%d) != applied (%d) or state %s not admissible", + rep.ReplicaID, progress.Match, status.Applied, progress.State) } return nil, nil, false } diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index 6e57895c8af2..aeb04de245a3 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -10065,9 +10065,9 @@ func TestShouldReplicaQuiesce(t *testing.T) { LeadTransferee: 0, }, Progress: map[uint64]tracker.Progress{ - 1: {Match: logIndex}, - 2: {Match: logIndex}, - 3: {Match: logIndex}, + 1: {Match: logIndex, State: tracker.StateReplicate}, + 2: {Match: logIndex, State: tracker.StateReplicate}, + 3: {Match: logIndex, State: tracker.StateReplicate}, }, }, lastIndex: logIndex, @@ -10209,6 +10209,12 @@ func TestShouldReplicaQuiesce(t *testing.T) { q.raftReady = true return q }) + test(false, func(q *testQuiescer) *testQuiescer { + pr := q.status.Progress[2] + pr.State = tracker.StateProbe + q.status.Progress[2] = pr + return q + }) // Create a mismatch between the raft progress replica IDs and the // replica IDs in the range descriptor. for i := 0; i < 3; i++ {