Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix replica unable trigger migration when it received CLUSTER SETSLOT in advance #981

Merged
merged 11 commits into from
Sep 13, 2024
21 changes: 21 additions & 0 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6390,7 +6390,9 @@ void clusterCommandSetSlot(client *c) {
server.cluster->migrating_slots_to[slot] = NULL;
}

clusterNode *my_primary = clusterNodeGetPrimary(myself);
int slot_was_mine = server.cluster->slots[slot] == myself;
int slot_was_my_primary = server.cluster->slots[slot] == my_primary;
clusterDelSlot(slot);
clusterAddSlot(n, slot);

Expand All @@ -6407,6 +6409,25 @@ void clusterCommandSetSlot(client *c) {
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
}

/* If we are a replica and executing the CLUSTER SETSLOT command from
* my primary, and my primary left without slots, we should turn into a
* replica of the new primary. */
if (nodeIsReplica(myself) && slot_was_my_primary && my_primary->numslots == 0 &&
server.cluster_allow_replica_migration) {
serverLog(LL_NOTICE,
"Lost my last slot during slot migration. Reconfiguring myself "
"as a replica of %.40s (%s) in shard %.40s",
n->name, n->human_nodename, n->shard_id);
/* We are a replica and the client is actually my primary, the following
* clusterSetPrimary will free the client so we need to protect it. */
protectClient(c);
/* We are migrating to a different shard that has a completely different
* replication history, so a full sync is required. */
clusterSetPrimary(n, 1, 1);
unprotectClient(c);
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
}

/* If this node or this node's primary was importing this slot,
* assigning the slot to itself also clears the importing status. */
if ((n == myself || n == myself->replicaof) && server.cluster->importing_slots_from[slot]) {
Expand Down
1 change: 1 addition & 0 deletions src/networking.c
Original file line number Diff line number Diff line change
Expand Up @@ -2560,6 +2560,7 @@ void freeSharedQueryBuf(void) {
*
* * DEBUG RELOAD and similar.
* * When a Lua script is in -BUSY state.
* * A cluster replica doing CLUSTER SETSLOT and doing migration.
*
* So the function will protect the client by doing two things:
*
Expand Down
25 changes: 23 additions & 2 deletions tests/unit/cluster/replica-migration.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,17 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
test_sub_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT" {
proc test_cluster_setslot {type} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT - $type" {
R 3 config set cluster-allow-replica-migration no
R 7 config set cluster-allow-replica-migration yes

if {$type == "setslot"} {
# Make R 7 drop the PING message so that we have a higher
# chance to trigger the migration from CLUSTER SETSLOT.
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER 1
}

# Move slot 0 from primary 3 to primary 0.
set addr "[srv 0 host]:[srv 0 port]"
set myid [R 3 CLUSTER MYID]
Expand All @@ -349,6 +355,13 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
}

# Wait for R 3 to report that it is an empty replica (cluster-allow-replica-migration no)
wait_for_log_messages -3 {"*I am now an empty primary*"} 0 1000 50

if {$type == "setslot"} {
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER -1
}

# Make sure server 3 lost its replica (server 7) and server 7 becomes a replica of primary 0.
wait_for_condition 1000 50 {
[s -3 role] eq {master} &&
Expand All @@ -361,4 +374,12 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "Server 3 and 7 role response has not changed"
}
}
}

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "gossip"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "setslot"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster
Loading