Only primary with slots has the right to mark a node as failed #634

Merged Jun 17, 2024 · 7 commits
23 changes: 14 additions & 9 deletions src/cluster_legacy.c
@@ -116,6 +116,12 @@ int verifyClusterNodeId(const char *name, int length);
sds clusterEncodeOpenSlotsAuxField(int rdbflags);
int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s);

/* Only primaries that own slots have voting rights.
* Returns 1 if the node has voting rights, otherwise returns 0. */
static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
}

int getNodeDefaultClientPort(clusterNode *n) {
return server.tls_cluster ? n->tls_port : n->tcp_port;
}
@@ -1867,8 +1873,8 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
if (nodeFailed(node)) return; /* Already FAILing. */

failures = clusterNodeFailureReportsCount(node);
- /* Also count myself as a voter if I'm a primary. */
- if (clusterNodeIsPrimary(myself)) failures++;
+ /* Also count myself as a voter if I'm a voting primary. */
+ if (clusterNodeIsVotingPrimary(myself)) failures++;
if (failures < needed_quorum) return; /* No weak agreement from primaries. */

serverLog(LL_NOTICE, "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename);
@@ -1908,7 +1914,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
* 1) The FAIL state is old enough.
* 2) It is yet serving slots from our point of view (not failed over).
* Apparently no one is going to fix these slots, clear the FAIL flag. */
- if (clusterNodeIsPrimary(node) && node->numslots > 0 &&
+ if (clusterNodeIsVotingPrimary(node) &&
(now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) {
serverLog(
LL_NOTICE,
@@ -2090,8 +2096,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
/* Ignore gossips about self. */
if (node && node != myself) {
/* We already know this node.
- Handle failure reports, only when the sender is a primary. */
- if (sender && clusterNodeIsPrimary(sender)) {
+ Handle failure reports, only when the sender is a voting primary. */
+ if (sender && clusterNodeIsVotingPrimary(sender)) {
if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
if (clusterNodeAddFailureReport(node, sender)) {
serverLog(LL_VERBOSE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
@@ -3266,8 +3272,7 @@ int clusterProcessPacket(clusterLink *link) {
/* We consider this vote only if the sender is a primary serving
* a non zero number of slots, and its currentEpoch is greater or
* equal to epoch where this node started the election. */
- if (clusterNodeIsPrimary(sender) && sender->numslots > 0 &&
- senderCurrentEpoch >= server.cluster->failover_auth_epoch) {
+ if (clusterNodeIsVotingPrimary(sender) && senderCurrentEpoch >= server.cluster->failover_auth_epoch) {
server.cluster->failover_auth_count++;
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP. */
@@ -4779,7 +4784,7 @@ void clusterCron(void) {
if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) {
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
- if (clusterNodeIsPrimary(myself) && server.cluster->size == 1) {
+ if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) {
markNodeAsFailingIfNeeded(node);
} else {
serverLog(LL_DEBUG, "*** NODE %.40s possibly failing", node->name);
@@ -5049,7 +5054,7 @@ void clusterUpdateState(void) {
while ((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);

- if (clusterNodeIsPrimary(node) && node->numslots) {
+ if (clusterNodeIsVotingPrimary(node)) {
server.cluster->size++;
if ((node->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) == 0) reachable_primaries++;
}
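Taken together, the hunks above route every "may this node vote or report failures?" decision through the new clusterNodeIsVotingPrimary() helper instead of the looser clusterNodeIsPrimary(). A minimal standalone sketch of that gate is below; the struct, the flag value, the failure_reports field, and the shouldMarkAsFailing() wrapper are simplified stand-ins invented for illustration, while the helper itself mirrors the function added in this diff.

#include <stdio.h>

#define CLUSTER_NODE_PRIMARY (1 << 0)   /* simplified stand-in flag value */

typedef struct clusterNode {
    int flags;
    int numslots;          /* number of hash slots owned by the node */
    int failure_reports;   /* stand-in for clusterNodeFailureReportsCount() */
} clusterNode;

/* Mirrors the helper added by this PR: only a primary that owns at least
 * one slot has voting rights. */
static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
    return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
}

/* Sketch of the quorum test in markNodeAsFailingIfNeeded(): take the failure
 * reports already collected from voting primaries, count ourselves only if
 * we are a voting primary, and require a majority of the voting primaries. */
static int shouldMarkAsFailing(clusterNode *myself, clusterNode *suspect, int voting_primaries) {
    int needed_quorum = voting_primaries / 2 + 1;
    int failures = suspect->failure_reports;
    if (clusterNodeIsVotingPrimary(myself)) failures++;
    return failures >= needed_quorum;
}

int main(void) {
    clusterNode empty_primary = {CLUSTER_NODE_PRIMARY, 0, 0};   /* primary without slots */
    clusterNode slot_owner = {CLUSTER_NODE_PRIMARY, 16384, 0};  /* primary owning slots */
    clusterNode suspect = {0, 0, 0};

    /* With the old clusterNodeIsPrimary() check, an empty primary could reach
     * the quorum of 1 on its own in a size-1 cluster; with the new gate only
     * the slot-owning primary can. */
    printf("empty primary alone: %d\n", shouldMarkAsFailing(&empty_primary, &suspect, 1));
    printf("slot owner alone:    %d\n", shouldMarkAsFailing(&slot_owner, &suspect, 1));
    return 0;
}

Under this gate, the single-primary scenario in the test file below only lets the slot-owning primary flip the paused replica to FAIL.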
68 changes: 68 additions & 0 deletions tests/unit/cluster/failure-marking.tcl
@@ -16,6 +16,8 @@ start_cluster 1 1 {tags {external:skip cluster}} {
pause_process $replica1_pid

wait_node_marked_fail 0 $replica1_instance_id

resume_process $replica1_pid
}
}

@@ -49,5 +51,71 @@ start_cluster 2 1 {tags {external:skip cluster}} {
resume_process $primary2_pid

wait_node_marked_fail 0 $replica1_instance_id

resume_process $replica1_pid
}
}

set old_singledb $::singledb
set ::singledb 1

tags {external:skip tls:skip cluster} {
set base_conf [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000 save ""]
start_multiple_servers 5 [list overrides $base_conf] {
test "Only primary with slots has the right to mark a node as failed" {
set primary_host [srv 0 host]
set primary_port [srv 0 port]
set primary_pid [srv 0 pid]
set primary_id [R 0 CLUSTER MYID]
set replica_id [R 1 CLUSTER MYID]
set replica_pid [srv -1 pid]

# Meet other nodes.
R 1 CLUSTER MEET $primary_host $primary_port
R 2 CLUSTER MEET $primary_host $primary_port
R 3 CLUSTER MEET $primary_host $primary_port
R 4 CLUSTER MEET $primary_host $primary_port

# Build a single primary cluster.
cluster_allocate_slots 1 1
wait_for_cluster_propagation
R 1 CLUSTER REPLICATE $primary_id
wait_for_cluster_propagation
wait_for_cluster_state "ok"

# Pause the primary, marking the primary as pfail.
pause_process $primary_pid
wait_node_marked_pfail 1 $primary_id
wait_node_marked_pfail 2 $primary_id
wait_node_marked_pfail 3 $primary_id
wait_node_marked_pfail 4 $primary_id

# Pause the replica, marking the replica as pfail.
pause_process $replica_pid
wait_node_marked_pfail 2 $replica_id
wait_node_marked_pfail 3 $replica_id
wait_node_marked_pfail 4 $replica_id

# Resume the primary, marking the replica as fail.
resume_process $primary_pid
wait_node_marked_fail 0 $replica_id
wait_node_marked_fail 2 $replica_id
wait_node_marked_fail 3 $replica_id
wait_node_marked_fail 4 $replica_id

# Check if we got the right failure reports.
wait_for_condition 1000 50 {
[R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
[R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
[R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
[R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
} else {
fail "Cluster COUNT-FAILURE-REPORTS is not right."
}

resume_process $replica_pid
}
}
}

set ::singledb $old_singledb
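The COUNT-FAILURE-REPORTS assertions in the new test follow from the gossip-side gate in clusterProcessGossipSection() above: a PFAIL/FAIL gossip entry is turned into a failure report only when the sender is a voting primary. The sketch below condenses that filter; the flag values, struct fields, and the processGossipEntry() wrapper are hypothetical simplifications, not the real API.

#include <stdio.h>

#define CLUSTER_NODE_PRIMARY (1 << 0)   /* simplified stand-in flag values */
#define CLUSTER_NODE_PFAIL   (1 << 1)
#define CLUSTER_NODE_FAIL    (1 << 2)

typedef struct clusterNode {
    int flags;
    int numslots;
    int failure_reports;   /* stand-in for the per-node failure report list */
} clusterNode;

static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
    return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
}

/* Condensed shape of the gossip handling: PFAIL/FAIL reports from primaries
 * without slots are ignored, so in the test only R 0 (the slot owner) can add
 * a failure report about the paused replica. That is why R 2..4 each see
 * exactly one report (from R 0) while R 0 itself, hearing only from empty
 * primaries, sees none. */
static void processGossipEntry(clusterNode *sender, clusterNode *reported, int gossiped_flags) {
    if (!clusterNodeIsVotingPrimary(sender)) return;
    if (gossiped_flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) reported->failure_reports++;
}

int main(void) {
    clusterNode slot_owner = {CLUSTER_NODE_PRIMARY, 16384, 0};
    clusterNode empty_primary = {CLUSTER_NODE_PRIMARY, 0, 0};
    clusterNode replica = {0, 0, 0};

    processGossipEntry(&empty_primary, &replica, CLUSTER_NODE_PFAIL); /* ignored */
    processGossipEntry(&slot_owner, &replica, CLUSTER_NODE_PFAIL);    /* recorded */
    printf("failure reports about the replica: %d\n", replica.failure_reports); /* prints 1 */
    return 0;
}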