Skip to content

Commit

Permalink
kvserver: ensure cleanup of all abandoned learners before an `AdminRe…
Browse files Browse the repository at this point in the history
…locateRange`

Previously, after cockroachdb#79405,
`maybeLeaveAtomicChangeReplicasAndRemoveLearners()` could return a range
descriptor that contained learner replicas. This was because of the idempotency
introduced in cockroachdb#79405, which relaxed how/when we failed replication changes due
to descriptor mismatch. The hazard was that, while we would ensure that the
learner we were trying to remove was indeed gone, we wouldn't do enough to
ensure that there weren't new learners that were added to the range due to
other concurrent `AdminRelocateRange` calls.

This was a violation of its contract and could sometimes fail
`AdminRelocateRange` requests.

This commit makes it such that when `maybeLeaveAtomicChangeReplicasAndRemoveLearners`
 returns a non-error response, it guarantees that _all_ the learner replicas
have been removed according to the returned range descriptor.

Release note: None
  • Loading branch information
aayushshah15 committed Apr 20, 2022
1 parent a24275c commit f4d79e9
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 21 deletions.
33 changes: 17 additions & 16 deletions pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -1310,32 +1310,34 @@ func (r *Replica) maybeLeaveAtomicChangeReplicasAndRemoveLearners(
return nil, err
}

if fn := r.store.TestingKnobs().BeforeRemovingDemotedLearner; fn != nil {
if fn := r.store.TestingKnobs().BeforeRemovingLearners; fn != nil {
fn()
}
// Now the config isn't joint any more, but we may have demoted some voters
// into learners. These learners should go as well.
learners := desc.Replicas().LearnerDescriptors()
if len(learners) == 0 {
return desc, nil
}
targets := make([]roachpb.ReplicationTarget, len(learners))
for i := range learners {
targets[i].NodeID = learners[i].NodeID
targets[i].StoreID = learners[i].StoreID
}
log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc)
// NB: unroll the removals because at the time of writing, we can't atomically
// remove multiple learners. This will be fixed in:
//
// https://github.com/cockroachdb/cockroach/pull/40268
origDesc := desc
store := r.store
for _, target := range targets {
// Now the config isn't joint any more, but we may have demoted some voters
// into learners. These learners should go as well.
for {
// We loop until we've removed all learners. After every call to
// `execChangeReplicasTxn`, we re-check if there are any learners to be
// removed. This is to guarantee that when this method returns a non-error
// response, it will never return a `desc` that has learners.
learners := desc.Replicas().LearnerDescriptors()
if len(learners) == 0 {
return desc, nil
}
removalTarget := roachpb.ReplicationTarget{
NodeID: learners[0].NodeID, StoreID: learners[0].StoreID,
}
var err error
log.VEventf(ctx, 2, `removing learner replica %v from %v`, removalTarget, desc)
desc, err = execChangeReplicasTxn(
ctx, desc, kvserverpb.ReasonAbandonedLearner, "",
[]internalReplicationChange{{target: target, typ: internalChangeTypeRemoveLearner}},
[]internalReplicationChange{{target: removalTarget, typ: internalChangeTypeRemoveLearner}},
changeReplicasTxnArgs{db: store.DB(),
liveAndDeadReplicas: store.allocator.storePool.liveAndDeadReplicas,
logChange: store.logChange,
Expand All @@ -1347,7 +1349,6 @@ func (r *Replica) maybeLeaveAtomicChangeReplicasAndRemoveLearners(
return nil, errors.Wrapf(err, `removing learners from %s`, origDesc)
}
}
return desc, nil
}

// validateAdditionsPerStore ensures that we're not trying to add the same type
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/replica_learner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,7 @@ func TestDemotedLearnerRemovalHandlesRace(t *testing.T) {
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
BeforeRemovingDemotedLearner: func() {
BeforeRemovingLearners: func() {
if atomic.LoadInt64(&activateTestingKnob) == 1 {
// Signal to the test that the rebalance is now trying to remove
// the demoted learner.
Expand Down Expand Up @@ -1291,7 +1291,7 @@ func TestMergeQueueSeesLearnerOrJointConfig(t *testing.T) {
require.NoError(t, processErr)
formattedTrace := trace.String()
expectedMessages := []string{
`removing learner replicas \[n2,s2\]`,
`removing learner replica.*n2,s2`,
`merging to produce range: /Table/Max-/Max`,
}
if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil {
Expand Down
6 changes: 3 additions & 3 deletions pkg/kv/kvserver/testing_knobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,9 @@ type StoreTestingKnobs struct {
// through a joint configuration, even when this isn't necessary (because
// the replication change affects only one replica).
ReplicationAlwaysUseJointConfig func() bool
// BeforeRemovingDemotedLearner is run before a demoted learner (i.e. a
// learner that results from a range exiting its joint config) is removed.
BeforeRemovingDemotedLearner func()
// BeforeRemovingLearners is run before an abandoned or demoted learner (i.e.
// a learner that results from a range exiting its joint config) is removed.
BeforeRemovingLearners func()
// BeforeSnapshotSSTIngestion is run just before the SSTs are ingested when
// applying a snapshot.
BeforeSnapshotSSTIngestion func(IncomingSnapshot, kvserverpb.SnapshotRequest_Type, []string) error
Expand Down

0 comments on commit f4d79e9

Please sign in to comment.