diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 305928ebfffd..e26e88906005 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -1310,32 +1310,34 @@ func (r *Replica) maybeLeaveAtomicChangeReplicasAndRemoveLearners( return nil, err } - if fn := r.store.TestingKnobs().BeforeRemovingDemotedLearner; fn != nil { + if fn := r.store.TestingKnobs().BeforeRemovingLearners; fn != nil { fn() } - // Now the config isn't joint any more, but we may have demoted some voters - // into learners. These learners should go as well. - learners := desc.Replicas().LearnerDescriptors() - if len(learners) == 0 { - return desc, nil - } - targets := make([]roachpb.ReplicationTarget, len(learners)) - for i := range learners { - targets[i].NodeID = learners[i].NodeID - targets[i].StoreID = learners[i].StoreID - } - log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc) // NB: unroll the removals because at the time of writing, we can't atomically // remove multiple learners. This will be fixed in: // // https://github.com/cockroachdb/cockroach/pull/40268 origDesc := desc store := r.store - for _, target := range targets { + // Now the config isn't joint any more, but we may have demoted some voters + // into learners. These learners should go as well. + for { + // We loop until we've removed all learners. After every call to + // `execChangeReplicasTxn`, we re-check if there are any learners to be + // removed. This is to guarantee that when this method returns a non-error + // response, it will never return a `desc` that has learners. + learners := desc.Replicas().LearnerDescriptors() + if len(learners) == 0 { + return desc, nil + } + removalTarget := roachpb.ReplicationTarget{ + NodeID: learners[0].NodeID, StoreID: learners[0].StoreID, + } var err error + log.VEventf(ctx, 2, `removing learner replica %v from %v`, removalTarget, desc) desc, err = execChangeReplicasTxn( ctx, desc, kvserverpb.ReasonAbandonedLearner, "", - []internalReplicationChange{{target: target, typ: internalChangeTypeRemoveLearner}}, + []internalReplicationChange{{target: removalTarget, typ: internalChangeTypeRemoveLearner}}, changeReplicasTxnArgs{db: store.DB(), liveAndDeadReplicas: store.allocator.storePool.liveAndDeadReplicas, logChange: store.logChange, @@ -1347,7 +1349,6 @@ func (r *Replica) maybeLeaveAtomicChangeReplicasAndRemoveLearners( return nil, errors.Wrapf(err, `removing learners from %s`, origDesc) } } - return desc, nil } // validateAdditionsPerStore ensures that we're not trying to add the same type diff --git a/pkg/kv/kvserver/replica_learner_test.go b/pkg/kv/kvserver/replica_learner_test.go index c20d0935fd2a..2bb0f8b7ddd4 100644 --- a/pkg/kv/kvserver/replica_learner_test.go +++ b/pkg/kv/kvserver/replica_learner_test.go @@ -1096,7 +1096,7 @@ func TestDemotedLearnerRemovalHandlesRace(t *testing.T) { ServerArgs: base.TestServerArgs{ Knobs: base.TestingKnobs{ Store: &kvserver.StoreTestingKnobs{ - BeforeRemovingDemotedLearner: func() { + BeforeRemovingLearners: func() { if atomic.LoadInt64(&activateTestingKnob) == 1 { // Signal to the test that the rebalance is now trying to remove // the demoted learner. @@ -1291,7 +1291,7 @@ func TestMergeQueueSeesLearnerOrJointConfig(t *testing.T) { require.NoError(t, processErr) formattedTrace := trace.String() expectedMessages := []string{ - `removing learner replicas \[n2,s2\]`, + `removing learner replica.*n2,s2`, `merging to produce range: /Table/Max-/Max`, } if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil { diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 70380c39e26d..e7bb6ed95218 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -304,9 +304,9 @@ type StoreTestingKnobs struct { // through a joint configuration, even when this isn't necessary (because // the replication change affects only one replica). ReplicationAlwaysUseJointConfig func() bool - // BeforeRemovingDemotedLearner is run before a demoted learner (i.e. a - // learner that results from a range exiting its joint config) is removed. - BeforeRemovingDemotedLearner func() + // BeforeRemovingLearners is run before an abandoned or demoted learner (i.e. + // a learner that results from a range exiting its joint config) is removed. + BeforeRemovingLearners func() // BeforeSnapshotSSTIngestion is run just before the SSTs are ingested when // applying a snapshot. BeforeSnapshotSSTIngestion func(IncomingSnapshot, kvserverpb.SnapshotRequest_Type, []string) error