From 879e26ec067280c461449eae9902bf822c10c937 Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 20 Mar 2020 19:01:05 +0000 Subject: [PATCH] Describe STALE_STATE_CONFIG in ClusterFormationFH (#53878) We mark cluster states persisted on master-ineligible nodes as potentially-stale using the voting configuration `{STALE_STATE_CONFIG}` which prevents these nodes from being elected as master if they are restarted as master-eligible. Today we do not handle this special voting configuration differently in the `ClusterFormationFailureHelper`, leading to a mysterious message `an election requires a node with id [STALE_STATE_CONFIG]` if the election does not succeed. This commit adds a special case description for this situation to better explain why this node cannot win an election. Closes #53734 --- .../coordination/ClusterFormationFailureHelper.java | 8 +++++++- .../org/elasticsearch/gateway/GatewayMetaState.java | 9 ++++++++- .../ClusterFormationFailureHelperTests.java | 10 ++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java index a4f31fb3ab168..ee9d3e0468d88 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java @@ -31,6 +31,7 @@ import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.AbstractRunnable; +import org.elasticsearch.gateway.GatewayMetaState; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.threadpool.ThreadPool.Names; @@ -210,7 +211,12 @@ private String describeQuorum(VotingConfiguration votingConfiguration) { assert requiredNodes <= realNodeIds.size() : nodeIds; if 
(nodeIds.size() == 1) { - return "a node with id " + realNodeIds; + if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) { + return "one or more nodes that have already participated as master-eligible nodes in the cluster but this node was " + + "not master-eligible the last time it joined the cluster"; + } else { + return "a node with id " + realNodeIds; + } } else if (nodeIds.size() == 2) { return "two nodes with ids " + realNodeIds; } else { diff --git a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java index 8ac1b73ae5df5..3fab416f73c34 100644 --- a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java +++ b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java @@ -81,6 +81,13 @@ */ public class GatewayMetaState implements Closeable { + /** + * Fake node ID for a voting configuration written by a master-ineligible data node to indicate that its on-disk state is potentially + * stale (since it is written asynchronously after application, rather than before acceptance). This node ID means that if the node is + * restarted as a master-eligible node then it does not win any elections until it has received a fresh cluster state. 
+ */ + public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG"; + // Set by calling start() private final SetOnce persistedState = new SetOnce<>(); @@ -425,7 +432,7 @@ protected void doRun() { } static final CoordinationMetaData.VotingConfiguration staleStateConfiguration = - new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG")); + new CoordinationMetaData.VotingConfiguration(Collections.singleton(STALE_STATE_CONFIG_NODE_ID)); static ClusterState resetVotingConfiguration(ClusterState clusterState) { CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData()) diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java index ec115265652d0..dcea2b1118b80 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java @@ -29,6 +29,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.TransportAddress; +import org.elasticsearch.gateway.GatewayMetaState; import org.elasticsearch.test.ESTestCase; import java.util.Arrays; @@ -412,5 +413,14 @@ public void testDescriptionAfterBootstrapping() { "have discovered [] which is not a quorum; " + "discovery will continue using [] from hosts providers and [" + otherMasterNode + ", " + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"))); + + assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(), + emptyList(), 0L, electionStrategy).getDescription(), + is("master not discovered or elected yet, an election 
requires one or more nodes that have already participated as " + + "master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " + + "have discovered [] which is not a quorum; " + + "discovery will continue using [] from hosts providers and [" + localNode + + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0")); + } }