From 4e25c4583e44d7ec64a9a75de598847014f713d8 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 12 Jun 2024 15:39:28 +0800 Subject: [PATCH 1/6] Only primary with slots has the right to mark a node as failed In markNodeAsFailingIfNeeded we will count needed_quorum and failures, needed_quorum is the half the cluster->size and plus one, and cluster-size is the size of primary node which contain slots, but when counting failures, we dit not check if primary has slots. Signed-off-by: Binbin --- src/cluster_legacy.c | 6 +-- tests/unit/cluster/failure-marking.tcl | 68 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index cd3786fe05..2cffc32e52 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1868,7 +1868,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { failures = clusterNodeFailureReportsCount(node); /* Also count myself as a voter if I'm a primary. */ - if (clusterNodeIsPrimary(myself)) failures++; + if (clusterNodeIsPrimary(myself) && myself->numslots) failures++; if (failures < needed_quorum) return; /* No weak agreement from primaries. */ serverLog(LL_NOTICE, "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); @@ -2091,7 +2091,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { if (node && node != myself) { /* We already know this node. Handle failure reports, only when the sender is a primary. */ - if (sender && clusterNodeIsPrimary(sender)) { + if (sender && clusterNodeIsPrimary(sender) && sender->numslots) { if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) { if (clusterNodeAddFailureReport(node, sender)) { serverLog(LL_VERBOSE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", @@ -4779,7 +4779,7 @@ void clusterCron(void) { if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) { node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; - if (clusterNodeIsPrimary(myself) && server.cluster->size == 1) { + if (server.cluster->size == 1 && clusterNodeIsPrimary(myself) && myself->numslots) { markNodeAsFailingIfNeeded(node); } else { serverLog(LL_DEBUG, "*** NODE %.40s possibly failing", node->name); diff --git a/tests/unit/cluster/failure-marking.tcl b/tests/unit/cluster/failure-marking.tcl index c4746c8264..cfed7fff0f 100644 --- a/tests/unit/cluster/failure-marking.tcl +++ b/tests/unit/cluster/failure-marking.tcl @@ -16,6 +16,8 @@ start_cluster 1 1 {tags {external:skip cluster}} { pause_process $replica1_pid wait_node_marked_fail 0 $replica1_instance_id + + resume_process $replica1_pid } } @@ -49,5 +51,71 @@ start_cluster 2 1 {tags {external:skip cluster}} { resume_process $primary2_pid wait_node_marked_fail 0 $replica1_instance_id + + resume_process $replica1_pid + } +} + +set old_singledb $::singledb +set ::singledb 1 + +tags {external:skip tls:skip cluster} { + set base_conf [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000 save ""] + start_multiple_servers 5 [list overrides $base_conf] { + test "Only primary with slots has the right to mark a node as failed" { + set primary_host [srv 0 host] + set primary_port [srv 0 port] + set primary_pid [srv 0 pid] + set primary_id [R 0 CLUSTER MYID] + set replica_id [R 1 CLUSTER MYID] + set replica_pid [srv -1 pid] + + # Meet others nodes. + R 1 CLUSTER MEET $primary_host $primary_port + R 2 CLUSTER MEET $primary_host $primary_port + R 3 CLUSTER MEET $primary_host $primary_port + R 4 CLUSTER MEET $primary_host $primary_port + + # Build a single primary cluster. + cluster_allocate_slots 1 1 + wait_for_cluster_propagation + R 1 CLUSTER REPLICATE $primary_id + wait_for_cluster_propagation + wait_for_cluster_state "ok" + + # Pause the primary, marking the primary as pfail. + pause_process $primary_pid + wait_node_marked_pfail 1 $primary_id + wait_node_marked_pfail 2 $primary_id + wait_node_marked_pfail 3 $primary_id + wait_node_marked_pfail 4 $primary_id + + # Pause the replica, marking the replica as pfail. + pause_process $replica_pid + wait_node_marked_pfail 2 $replica_id + wait_node_marked_pfail 3 $replica_id + wait_node_marked_pfail 4 $replica_id + + # Resume the primary, marking the replica as fail. + resume_process $primary_pid + wait_node_marked_fail 0 $replica_id + wait_node_marked_fail 2 $replica_id + wait_node_marked_fail 3 $replica_id + wait_node_marked_fail 4 $replica_id + + # Check if we got the right failure reports. + wait_for_condition 1000 50 { + [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 && + [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 && + [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 && + [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 + } else { + fail "Cluster COUNT-FAILURE-REPORTS is not right." + } + + resume_process $replica_pid + } } } + +set ::singledb $old_singledb From 6845725d8bc5c928e687bd7e542976ef849ef1a3 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 13 Jun 2024 14:57:01 +0800 Subject: [PATCH 2/6] update comment Signed-off-by: Binbin --- src/cluster_legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 2cffc32e52..8be59caee6 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1867,7 +1867,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { if (nodeFailed(node)) return; /* Already FAILing. */ failures = clusterNodeFailureReportsCount(node); - /* Also count myself as a voter if I'm a primary. */ + /* Also count myself as a voter if I'm a primary with slots. */ if (clusterNodeIsPrimary(myself) && myself->numslots) failures++; if (failures < needed_quorum) return; /* No weak agreement from primaries. */ @@ -2090,7 +2090,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Ignore gossips about self. */ if (node && node != myself) { /* We already know this node. - Handle failure reports, only when the sender is a primary. */ + Handle failure reports, only when the sender is a primary with slots. */ if (sender && clusterNodeIsPrimary(sender) && sender->numslots) { if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) { if (clusterNodeAddFailureReport(node, sender)) { From 897300f5a827ff06dd4d90a754786c8dacc2234d Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 13 Jun 2024 16:09:33 +0800 Subject: [PATCH 3/6] add a new clusterNodeIsVotingPrimary Signed-off-by: Binbin --- src/cluster.h | 1 + src/cluster_legacy.c | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index f163e7f688..540e5e95d9 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -84,6 +84,7 @@ int getMyShardSlotCount(void); int handleDebugClusterCommand(client *c); int clusterNodePending(clusterNode *node); int clusterNodeIsPrimary(clusterNode *n); +int clusterNodeIsVotingPrimary(clusterNode *n); char **getClusterNodesList(size_t *numnodes); char *clusterNodeIp(clusterNode *node); int clusterNodeIsReplica(clusterNode *node); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 8be59caee6..cdebe19993 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1867,8 +1867,8 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { if (nodeFailed(node)) return; /* Already FAILing. */ failures = clusterNodeFailureReportsCount(node); - /* Also count myself as a voter if I'm a primary with slots. */ - if (clusterNodeIsPrimary(myself) && myself->numslots) failures++; + /* Also count myself as a voter if I'm a voting primary. */ + if (clusterNodeIsVotingPrimary(myself)) failures++; if (failures < needed_quorum) return; /* No weak agreement from primaries. */ serverLog(LL_NOTICE, "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); @@ -1908,7 +1908,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { * 1) The FAIL state is old enough. * 2) It is yet serving slots from our point of view (not failed over). * Apparently no one is going to fix these slots, clear the FAIL flag. */ - if (clusterNodeIsPrimary(node) && node->numslots > 0 && + if (clusterNodeIsVotingPrimary(node) && (now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) { serverLog( LL_NOTICE, @@ -2090,8 +2090,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Ignore gossips about self. */ if (node && node != myself) { /* We already know this node. - Handle failure reports, only when the sender is a primary with slots. */ - if (sender && clusterNodeIsPrimary(sender) && sender->numslots) { + Handle failure reports, only when the sender is a voting primary. */ + if (sender && clusterNodeIsVotingPrimary(sender)) { if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) { if (clusterNodeAddFailureReport(node, sender)) { serverLog(LL_VERBOSE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", @@ -3266,8 +3266,7 @@ int clusterProcessPacket(clusterLink *link) { /* We consider this vote only if the sender is a primary serving * a non zero number of slots, and its currentEpoch is greater or * equal to epoch where this node started the election. */ - if (clusterNodeIsPrimary(sender) && sender->numslots > 0 && - senderCurrentEpoch >= server.cluster->failover_auth_epoch) { + if (clusterNodeIsVotingPrimary(sender) && senderCurrentEpoch >= server.cluster->failover_auth_epoch) { server.cluster->failover_auth_count++; /* Maybe we reached a quorum here, set a flag to make sure * we check ASAP. */ @@ -4779,7 +4778,7 @@ void clusterCron(void) { if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) { node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; - if (server.cluster->size == 1 && clusterNodeIsPrimary(myself) && myself->numslots) { + if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) { markNodeAsFailingIfNeeded(node); } else { serverLog(LL_DEBUG, "*** NODE %.40s possibly failing", node->name); @@ -5049,7 +5048,7 @@ void clusterUpdateState(void) { while ((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); - if (clusterNodeIsPrimary(node) && node->numslots) { + if (clusterNodeIsVotingPrimary(node)) { server.cluster->size++; if ((node->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) == 0) reachable_primaries++; } @@ -5807,6 +5806,14 @@ int clusterNodeIsPrimary(clusterNode *n) { return n->flags & CLUSTER_NODE_PRIMARY; } +/* Only primary that own slots have the voting rights. + * And only voting primary will be counted in cluster size. + * + * Returns 1 if the node has voting rights, otherwise returns 0. */ +int clusterNodeIsVotingPrimary(clusterNode *n) { + return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; +} + int handleDebugClusterCommand(client *c) { if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || strcasecmp(c->argv[2]->ptr, "KILL") || c->argc != 5) { return 0; From 1469477f2279462535858841971fdf3d7a2aa693 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 14 Jun 2024 12:29:01 +0800 Subject: [PATCH 4/6] static inline int Signed-off-by: Binbin --- src/cluster.h | 1 - src/cluster_legacy.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 540e5e95d9..f163e7f688 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -84,7 +84,6 @@ int getMyShardSlotCount(void); int handleDebugClusterCommand(client *c); int clusterNodePending(clusterNode *node); int clusterNodeIsPrimary(clusterNode *n); -int clusterNodeIsVotingPrimary(clusterNode *n); char **getClusterNodesList(size_t *numnodes); char *clusterNodeIp(clusterNode *node); int clusterNodeIsReplica(clusterNode *node); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index cdebe19993..23600a03be 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -69,6 +69,7 @@ int clusterDelSlot(int slot); int clusterDelNodeSlots(clusterNode *node); int clusterNodeSetSlotBit(clusterNode *n, int slot); void clusterSetPrimary(clusterNode *n, int closeSlots); +static inline int clusterNodeIsVotingPrimary(clusterNode *n); void clusterHandleReplicaFailover(void); void clusterHandleReplicaMigration(int max_replicas); int bitmapTestBit(unsigned char *bitmap, int pos); @@ -5810,7 +5811,7 @@ int clusterNodeIsPrimary(clusterNode *n) { * And only voting primary will be counted in cluster size. * * Returns 1 if the node has voting rights, otherwise returns 0. */ -int clusterNodeIsVotingPrimary(clusterNode *n) { +static inline int clusterNodeIsVotingPrimary(clusterNode *n) { return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; } From af2e92c4a1365511a2eb07edfcc6a43e9ccba4cc Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 14 Jun 2024 12:57:42 +0800 Subject: [PATCH 5/6] code review from Ping Signed-off-by: Binbin --- src/cluster_legacy.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 23600a03be..bba396048c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -69,7 +69,6 @@ int clusterDelSlot(int slot); int clusterDelNodeSlots(clusterNode *node); int clusterNodeSetSlotBit(clusterNode *n, int slot); void clusterSetPrimary(clusterNode *n, int closeSlots); -static inline int clusterNodeIsVotingPrimary(clusterNode *n); void clusterHandleReplicaFailover(void); void clusterHandleReplicaMigration(int max_replicas); int bitmapTestBit(unsigned char *bitmap, int pos); @@ -117,6 +116,14 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); +/* Only primary that own slots have the voting rights. + * And only voting primary will be counted in cluster size. + * + * Returns 1 if the node has voting rights, otherwise returns 0. */ +static inline int clusterNodeIsVotingPrimary(clusterNode *n) { + return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; +} + int getNodeDefaultClientPort(clusterNode *n) { return server.tls_cluster ? n->tls_port : n->tcp_port; } @@ -5807,14 +5814,6 @@ int clusterNodeIsPrimary(clusterNode *n) { return n->flags & CLUSTER_NODE_PRIMARY; } -/* Only primary that own slots have the voting rights. - * And only voting primary will be counted in cluster size. - * - * Returns 1 if the node has voting rights, otherwise returns 0. */ -static inline int clusterNodeIsVotingPrimary(clusterNode *n) { - return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; -} - int handleDebugClusterCommand(client *c) { if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || strcasecmp(c->argv[2]->ptr, "KILL") || c->argc != 5) { return 0; From acfdbb87300c473ffe1b404e6b812ec925804839 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 14 Jun 2024 14:13:37 +0800 Subject: [PATCH 6/6] Update src/cluster_legacy.c Signed-off-by: Binbin Co-authored-by: Ping Xie --- src/cluster_legacy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bba396048c..faeca2b508 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -116,9 +116,7 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); -/* Only primary that own slots have the voting rights. - * And only voting primary will be counted in cluster size. - * +/* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ static inline int clusterNodeIsVotingPrimary(clusterNode *n) { return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;