diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
index 40656f9051da..434535c9bebd 100644
--- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
+++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go
@@ -934,6 +934,7 @@ func (a *Allocator) computeAction(
 	// actions.
 	haveVoters := len(voterReplicas)
 	decommissioningVoters := storePool.DecommissioningReplicas(voterReplicas)
+	postDecommissionVoters := haveVoters - len(decommissioningVoters)
 	// Node count including dead nodes but excluding
 	// decommissioning/decommissioned nodes.
 	clusterNodes := storePool.ClusterNodeCount()
@@ -975,9 +976,9 @@ func (a *Allocator) computeAction(
 		return action, action.Priority()
 	}
 
-	if haveVoters == neededVoters && len(deadVoters) > 0 {
+	if postDecommissionVoters <= neededVoters && len(deadVoters) > 0 {
 		// Range has dead voter(s). We should up-replicate to add another before
-		// before removing the dead one. This can avoid permanent data loss in cases
+		// removing the dead one. This can avoid permanent data loss in cases
 		// where the node is only temporarily dead, but we remove it from the range
 		// and lose a second node before we can up-replicate (#25392).
 		action = AllocatorReplaceDeadVoter
@@ -986,7 +987,7 @@ func (a *Allocator) computeAction(
 		return action, action.Priority()
 	}
 
-	if haveVoters == neededVoters && len(decommissioningVoters) > 0 {
+	if postDecommissionVoters < neededVoters {
 		// Range has decommissioning voter(s), which should be replaced.
 		action = AllocatorReplaceDecommissioningVoter
 		log.KvDistribution.VEventf(ctx, 3, "%s - replacement for %d decommissioning voters priority=%.2f",
@@ -1041,10 +1042,13 @@ func (a *Allocator) computeAction(
 		return action, action.Priority()
 	}
 
+	decommissioningNonVoters := storePool.DecommissioningReplicas(nonVoterReplicas)
+	postDecommissionNonVoters := haveNonVoters - len(decommissioningNonVoters)
 	liveNonVoters, deadNonVoters := storePool.LiveAndDeadReplicas(
 		nonVoterReplicas, includeSuspectAndDrainingStores,
 	)
-	if haveNonVoters == neededNonVoters && len(deadNonVoters) > 0 {
+
+	if postDecommissionNonVoters <= neededNonVoters && len(deadNonVoters) > 0 {
 		// The range has non-voter(s) on a dead node that we should replace.
 		action = AllocatorReplaceDeadNonVoter
 		log.KvDistribution.VEventf(ctx, 3, "%s - replacement for %d dead non-voters priority=%.2f",
@@ -1052,8 +1056,7 @@ func (a *Allocator) computeAction(
 		return action, action.Priority()
 	}
 
-	decommissioningNonVoters := storePool.DecommissioningReplicas(nonVoterReplicas)
-	if haveNonVoters == neededNonVoters && len(decommissioningNonVoters) > 0 {
+	if postDecommissionNonVoters < neededNonVoters {
 		// The range has non-voter(s) on a decommissioning node that we should
 		// replace.
 		action = AllocatorReplaceDecommissioningNonVoter
diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go
index cfef3624ee01..2cf8b14b6f1e 100644
--- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go
+++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go
@@ -6810,7 +6810,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDeadVoter,
+			expectedAction:  AllocatorReplaceDeadVoter,
 			live:            []roachpb.StoreID{1, 4},
 			dead:            []roachpb.StoreID{2},
 			decommissioning: []roachpb.StoreID{3},
@@ -6843,7 +6843,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDeadVoter,
+			expectedAction:  AllocatorReplaceDeadVoter,
 			live:            []roachpb.StoreID{1, 4},
 			dead:            nil,
 			decommissioning: []roachpb.StoreID{3},
@@ -6903,7 +6903,7 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDecommissioningVoter,
+			expectedAction:  AllocatorReplaceDecommissioningVoter,
 			live:            []roachpb.StoreID{4},
 			dead:            nil,
 			decommissioning: []roachpb.StoreID{1, 2, 3},
@@ -7090,7 +7090,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDeadVoter,
+			expectedAction:  AllocatorReplaceDeadVoter,
 			live:            []roachpb.StoreID{1, 4},
 			dead:            []roachpb.StoreID{2},
 			decommissioning: []roachpb.StoreID{3},
@@ -7123,7 +7123,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDeadVoter,
+			expectedAction:  AllocatorReplaceDeadVoter,
 			live:            []roachpb.StoreID{1, 4},
 			dead:            nil,
 			decommissioning: []roachpb.StoreID{3},
@@ -7183,7 +7183,7 @@ func TestAllocatorComputeActionWithStorePoolDecommission(t *testing.T) {
 					},
 				},
 			},
-			expectedAction:  AllocatorRemoveDecommissioningVoter,
+			expectedAction:  AllocatorReplaceDecommissioningVoter,
 			live:            []roachpb.StoreID{4},
 			dead:            nil,
 			decommissioning: []roachpb.StoreID{1, 2, 3},
@@ -7330,6 +7330,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
 	// set. We are checking that the effective replication factor is rounded down
 	// to the number of stores which are not decommissioned or decommissioning.
 	testCases := []struct {
+		name                string
 		storeList           []roachpb.StoreID
 		expectedNumReplicas int
 		expectedAction      AllocatorAction
@@ -7342,9 +7343,10 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
 			// Four known stores, three of them are decommissioning, so effective
 			// replication factor would be 1 if we hadn't decided that we'll never
 			// drop past 3, so 3 it is.
+			name:                "four replicas with three decommissioning",
 			storeList:           []roachpb.StoreID{1, 2, 3, 4},
 			expectedNumReplicas: 3,
-			expectedAction:      AllocatorRemoveDecommissioningVoter,
+			expectedAction:      AllocatorReplaceDecommissioningVoter,
 			live:                []roachpb.StoreID{4},
 			unavailable:         nil,
 			dead:                nil,
@@ -7352,6 +7354,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
 		},
 		{
 			// Ditto.
+			name:                "three replicas with three decommissioning",
 			storeList:           []roachpb.StoreID{1, 2, 3},
 			expectedNumReplicas: 3,
 			expectedAction:      AllocatorReplaceDecommissioningVoter,
@@ -7365,6 +7368,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
 			// factor would be even (four), in which case we drop down one more
 			// to three. Then the right thing becomes removing the dead replica
 			// from the range at hand, rather than trying to replace it.
+ name: "four replicas with one dead", storeList: []roachpb.StoreID{1, 2, 3, 4}, expectedNumReplicas: 3, expectedAction: AllocatorRemoveDeadVoter, @@ -7378,6 +7382,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { // in the system which amounts to an effective replication factor // of three (avoiding the even number). Adding a replica is more // important than replacing the dead one. + name: "two replicas with one dead", storeList: []roachpb.StoreID{1, 4}, expectedNumReplicas: 3, expectedAction: AllocatorAddVoter, @@ -7388,6 +7393,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { }, { // Similar to above, but nothing to do. + name: "three replicas with nothing to do", storeList: []roachpb.StoreID{1, 2, 3}, expectedNumReplicas: 3, expectedAction: AllocatorConsiderRebalance, @@ -7400,6 +7406,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { // Effective replication factor can't dip below three (unless the // span config explicitly asks for that, which it does not), so three // it is and we are under-replicaed. + name: "RF stays above three", storeList: []roachpb.StoreID{1, 2}, expectedNumReplicas: 3, expectedAction: AllocatorAddVoter, @@ -7410,6 +7417,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { }, { // Three and happy. + name: "three and happy", storeList: []roachpb.StoreID{1, 2, 3}, expectedNumReplicas: 3, expectedAction: AllocatorConsiderRebalance, @@ -7420,6 +7428,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { }, { // Three again, on account of avoiding the even four. + name: "avoid even replicas", storeList: []roachpb.StoreID{1, 2, 3, 4}, expectedNumReplicas: 3, expectedAction: AllocatorRemoveVoter, @@ -7431,6 +7440,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { { // The usual case in which there are enough nodes to accommodate the // span config. + name: "five and happy", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 5, expectedAction: AllocatorConsiderRebalance, @@ -7442,6 +7452,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { { // No dead or decommissioning node and enough nodes around, so // sticking with the span config. + name: "five and happy with one unavailable", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 5, expectedAction: AllocatorConsiderRebalance, @@ -7452,6 +7463,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { }, { // Ditto. + name: "five and happy with two unavailable", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 5, expectedAction: AllocatorConsiderRebalance, @@ -7462,6 +7474,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { }, { // Ditto, but we've lost quorum. + name: "five with lost quorum", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 5, expectedAction: AllocatorRangeUnavailable, @@ -7474,6 +7487,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { // Ditto (dead nodes don't reduce NumReplicas, only decommissioning // or decommissioned do, and both correspond to the 'decommissioning' // slice in these tests). 
+ name: "five with lost quorum on one dead and one unavailable", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 5, expectedAction: AllocatorReplaceDeadVoter, @@ -7485,6 +7499,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { { // Avoiding four, so getting three, and since there is no dead store // the most important thing is removing a decommissioning replica. + name: "five with one decommissioning and one unavailable", storeList: []roachpb.StoreID{1, 2, 3, 4, 5}, expectedNumReplicas: 3, expectedAction: AllocatorRemoveDecommissioningVoter, @@ -7514,7 +7529,7 @@ func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) { roachpb.RKey(keys.SystemPrefix), } { for _, c := range testCases { - t.Run(prefixKey.String(), func(t *testing.T) { + t.Run(fmt.Sprintf("%s%s", c.name, prefixKey), func(t *testing.T) { numNodes = len(c.storeList) - len(c.decommissioning) mockStorePool(sp, c.live, c.unavailable, c.dead, c.decommissioning, nil, nil) diff --git a/pkg/kv/kvserver/store_test.go b/pkg/kv/kvserver/store_test.go index 9cd597593ac4..e85e6f805a85 100644 --- a/pkg/kv/kvserver/store_test.go +++ b/pkg/kv/kvserver/store_test.go @@ -3469,7 +3469,7 @@ func TestAllocatorCheckRange(t *testing.T) { }{ { name: "overreplicated", - stores: multiRegionStores, + stores: multiRegionStores, // Nine stores existingReplicas: []roachpb.ReplicaDescriptor{ {NodeID: 1, StoreID: 1, ReplicaID: 1}, {NodeID: 2, StoreID: 2, ReplicaID: 2}, @@ -3482,7 +3482,7 @@ func TestAllocatorCheckRange(t *testing.T) { }, { name: "overreplicated but store dead", - stores: multiRegionStores, + stores: multiRegionStores, // Nine stores existingReplicas: []roachpb.ReplicaDescriptor{ {NodeID: 1, StoreID: 1, ReplicaID: 1}, {NodeID: 2, StoreID: 2, ReplicaID: 2}, @@ -3497,7 +3497,7 @@ func TestAllocatorCheckRange(t *testing.T) { }, { name: "decommissioning but underreplicated", - stores: multiRegionStores, + stores: multiRegionStores, // Nine stores existingReplicas: []roachpb.ReplicaDescriptor{ {NodeID: 1, StoreID: 1, ReplicaID: 1}, {NodeID: 2, StoreID: 2, ReplicaID: 2}, @@ -3511,7 +3511,7 @@ func TestAllocatorCheckRange(t *testing.T) { }, { name: "decommissioning with replacement", - stores: multiRegionStores, + stores: multiRegionStores, // Nine stores existingReplicas: []roachpb.ReplicaDescriptor{ {NodeID: 1, StoreID: 1, ReplicaID: 1}, {NodeID: 2, StoreID: 2, ReplicaID: 2}, @@ -3541,7 +3541,7 @@ func TestAllocatorCheckRange(t *testing.T) { }, { name: "five to four nodes at RF five", - stores: noLocalityStores, + stores: noLocalityStores, // Five stores existingReplicas: []roachpb.ReplicaDescriptor{ {NodeID: 1, StoreID: 1, ReplicaID: 1}, {NodeID: 2, StoreID: 2, ReplicaID: 2}, @@ -3558,6 +3558,83 @@ func TestAllocatorCheckRange(t *testing.T) { expectErr: false, expectValidTarget: false, }, + { + name: "five to two nodes at RF five replaces first", + stores: noLocalityStores, // Five stores + existingReplicas: []roachpb.ReplicaDescriptor{ + {NodeID: 1, StoreID: 1, ReplicaID: 1}, + {NodeID: 2, StoreID: 2, ReplicaID: 2}, + {NodeID: 3, StoreID: 3, ReplicaID: 3}, + {NodeID: 4, StoreID: 4, ReplicaID: 4}, + {NodeID: 5, StoreID: 5, ReplicaID: 5}, + }, + zoneConfig: zonepb.DefaultSystemZoneConfigRef(), + livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{ + 3: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 4: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 5: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + }, + baselineExpNoop: true, + expectedAction: 
allocatorimpl.AllocatorReplaceDecommissioningVoter, + expectValidTarget: false, + expectAllocatorErr: true, + expectedErrStr: "likely not enough nodes in cluster", + }, + { + name: "replace first when lowering effective RF", + stores: multiRegionStores, // Nine stores + existingReplicas: []roachpb.ReplicaDescriptor{ + // Region "a" + {NodeID: 1, StoreID: 1, ReplicaID: 1}, + {NodeID: 2, StoreID: 2, ReplicaID: 2}, + // Region "b" + {NodeID: 4, StoreID: 4, ReplicaID: 3}, + {NodeID: 5, StoreID: 5, ReplicaID: 4}, + // Region "c" + {NodeID: 7, StoreID: 7, ReplicaID: 5}, + }, + zoneConfig: zonepb.DefaultSystemZoneConfigRef(), + livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{ + // Downsize to one node per region: 3,6,9. + 1: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 2: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 4: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 5: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 7: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 8: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + }, + expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter, + expectErr: false, + expectValidTarget: true, + }, + { + name: "replace dead first when lowering effective RF", + stores: multiRegionStores, // Nine stores + existingReplicas: []roachpb.ReplicaDescriptor{ + // Region "a" + {NodeID: 1, StoreID: 1, ReplicaID: 1}, + {NodeID: 2, StoreID: 2, ReplicaID: 2}, + // Region "b" + {NodeID: 4, StoreID: 4, ReplicaID: 3}, + {NodeID: 5, StoreID: 5, ReplicaID: 4}, + // Region "c" + {NodeID: 7, StoreID: 7, ReplicaID: 5}, + }, + zoneConfig: zonepb.DefaultSystemZoneConfigRef(), + livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{ + // Downsize to: 1,4,7,9 but 7 is dead. + // Replica on n7 should be replaced first. + 2: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 3: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 5: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 6: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + 7: livenesspb.NodeLivenessStatus_DEAD, + 8: livenesspb.NodeLivenessStatus_DECOMMISSIONING, + }, + expectedAction: allocatorimpl.AllocatorReplaceDeadVoter, + expectErr: false, + expectValidTarget: true, + }, } { t.Run(tc.name, func(t *testing.T) { // Setup store pool based on store descriptors and configure test store. @@ -3629,6 +3706,10 @@ func TestAllocatorCheckRange(t *testing.T) { true /* collectTraces */, storePoolOverride, ) + require.Equalf(t, tc.expectedAction, action, + "expected action \"%s\", got action \"%s\"", tc.expectedAction, action, + ) + // Validate expectations from test case. if tc.expectErr || tc.expectAllocatorErr { require.Error(t, err) @@ -3647,10 +3728,6 @@ func TestAllocatorCheckRange(t *testing.T) { require.NoError(t, err) } - require.Equalf(t, tc.expectedAction, action, - "expected action \"%s\", got action \"%s\"", tc.expectedAction, action, - ) - if tc.expectValidTarget { require.NotEqualf(t, roachpb.ReplicationTarget{}, target, "expected valid target") }
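
Reviewer note: the behavioral change in the allocator.go hunks is that voters about to be lost to decommissioning are now counted before choosing between the replace and remove paths. Below is a minimal, self-contained sketch of that ordering with plain integer counts. The names (chooseVoterAction, the action constants) are illustrative only and are not the allocator's real API, and the real computeAction runs several other checks (add/remove voters, quorum, non-voters) around these two branches.

```go
// Sketch of the voter-replacement ordering introduced by this diff,
// reduced to integer counts. Not the allocator's real API.
package main

import "fmt"

type action string

const (
	replaceDeadVoter            action = "replace dead voter"
	replaceDecommissioningVoter action = "replace decommissioning voter"
	otherChecks                 action = "fall through to other checks"
)

// chooseVoterAction mirrors the new conditions: a dead voter is replaced
// first whenever losing the decommissioning voters would leave the range at
// or below the needed count; otherwise a decommissioning voter is replaced
// as soon as the post-decommission count falls below the needed count.
func chooseVoterAction(haveVoters, decommissioningVoters, deadVoters, neededVoters int) action {
	postDecommissionVoters := haveVoters - decommissioningVoters
	if postDecommissionVoters <= neededVoters && deadVoters > 0 {
		return replaceDeadVoter
	}
	if postDecommissionVoters < neededVoters {
		return replaceDecommissioningVoter
	}
	return otherChecks
}

func main() {
	// Five voters at RF 5 with three decommissioning: previously this looked
	// "correctly replicated" (haveVoters == neededVoters) and fell through to
	// removal; now the decommissioning voters are replaced.
	fmt.Println(chooseVoterAction(5, 3, 0, 5)) // replace decommissioning voter
	// Same, but one remaining voter is dead: the dead voter is replaced first.
	fmt.Println(chooseVoterAction(5, 3, 1, 5)) // replace dead voter
}
```

The same post-decommission accounting is applied to non-voters later in computeAction via postDecommissionNonVoters.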