Polling cluster formation state for master-is-stable health indicator #88397
@@ -0,0 +1,5 @@
pr: 88397
summary: Polling cluster formation state for master-is-stable health indicator
area: Health
type: enhancement
issues: []
@@ -10,6 +10,11 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.admin.cluster.coordination.ClusterFormationInfoAction;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.node.DiscoveryNode;
@@ -19,7 +24,14 @@
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ConnectionProfile;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.io.PrintWriter;
@@ -30,6 +42,9 @@
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@@ -47,6 +62,7 @@
 */
public class CoordinationDiagnosticsService implements ClusterStateListener {
    private final ClusterService clusterService;
    private final TransportService transportService;
    private final Coordinator coordinator;
    private final MasterHistoryService masterHistoryService;
    /**
@@ -63,6 +79,19 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     */
    private final int unacceptableIdentityChanges;

    /*
     * This is a list of tasks that periodically reach out to other master-eligible nodes to get their ClusterFormationStates for
     * diagnosis. This field is only ever accessed on the cluster change event thread, so there is no need to protect it for
     * thread safety.
     */
    private List<Scheduler.Cancellable> clusterFormationInfoTasks = List.of();
    /*
     * This field holds the results of the tasks in the clusterFormationInfoTasks field above. The field is accessed (reads/writes) from
     * multiple threads, but the reference itself is only ever changed on the cluster change event thread.
     */
    // Non-private for testing
    volatile ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = new ConcurrentHashMap<>();

    private static final Logger logger = LogManager.getLogger(CoordinationDiagnosticsService.class);

    /**
@@ -98,10 +127,12 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {

    public CoordinationDiagnosticsService(
        ClusterService clusterService,
        TransportService transportService,
        Coordinator coordinator,
        MasterHistoryService masterHistoryService
    ) {
        this.clusterService = clusterService;
        this.transportService = transportService;
        this.coordinator = coordinator;
        this.masterHistoryService = masterHistoryService;
        this.nodeHasMasterLookupTimeframe = NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.get(clusterService.getSettings());
@@ -410,6 +441,204 @@ public void clusterChanged(ClusterChangedEvent event) {
                }
            }
        }
        if (currentMaster == null && clusterService.localNode().isMasterNode()) {
            /*
             * This begins polling all master-eligible nodes for cluster formation information. However, there is a 10-second delay
             * before it starts, so in the normal situation where a master transition flips from master1 -> null -> master2, the
             * polling tasks will be canceled before any requests are actually made.
             */
            beginPollingClusterFormationInfo();
        } else {
            cancelPollingClusterFormationInfo();
        }
    }

    private void beginPollingClusterFormationInfo() {
        cancelPollingClusterFormationInfo();
        clusterFormationInfoTasks = getMasterEligibleNodes().stream()
            .map(masterNode -> beginPollingClusterFormationInfo(masterNode, clusterFormationResponses))
            .collect(Collectors.toList());
    }

    private void cancelPollingClusterFormationInfo() {
        clusterFormationInfoTasks.forEach(Scheduler.Cancellable::cancel);
        /*
         * Recreates the map so that we don't read old information, or worse, get stuck with information about a node that has been
         * removed from the cluster.
         */
        clusterFormationResponses = new ConcurrentHashMap<>();
    }

    // Non-private for testing
    Scheduler.Cancellable beginPollingClusterFormationInfo(
        DiscoveryNode node,
        final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap
    ) {
        return new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap).pollUntilCancelled();
    }

    /*
     * This inner class wraps the logic of polling a master-eligible node for its cluster formation information (which is needed in the
     * event that the cluster cannot elect a master node).
     */
    // Non-private for testing
    class PollClusterFormationStateTask {
        /**
         * The node that is being polled
         */
        private final DiscoveryNode node;
        /**
         * This is a reference to the global nodeToClusterFormationStateMap that was current at the time this object was constructed.
         * The global map is recreated whenever the task is cancelled. Having this reference prevents accidental writes to that map
         * after cancellation.
         */
        private final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap;

Reviewer (on the javadoc above): This is a bit confusing, as it implies we're copying it (i.e. "that was current at the time this object was constructed"), but it also "references" it, which means it's going to change in line with the global map. I'm also not sure about the accidental writes to "that" map (it's "this" map, isn't it :) ?), nor what's accidental about the said writes. Could we maybe talk about why we have this map here, as opposed to the implementation details of how it arrived here? (Apologies if I'm misunderstanding its purpose.) Update: maybe we should drop

Author: Below is a scenario showing why I'm copying the

We begin polling master eligible nodes A, B, C

So now we have results for A, B, C, and D, even though C is no longer part of the cluster. So we get weird results when we look to see if we can form a quorum, or if C knows about the existence of D. Or even if it's not as extreme as a node being replaced, we have results that might be out of date, causing confusion. My simple way to avoid worrying about all of this was to just pass the method a reference to the clusterFormationResponses map that it is supposed to be using.
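
To make the scenario described above concrete, here is a minimal, self-contained sketch (plain Java; the class name and node names are illustrative, not the actual Elasticsearch types) of why each polling round works against the map reference it was created with: a late write from a cancelled round lands in the old map, so the freshly created map never sees stale entries.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Illustrative only: simulates a cancelled polling round writing a late result.
public class StaleMapSketch {
    // Stands in for clusterFormationResponses; the reference is swapped when polling is cancelled.
    static volatile ConcurrentMap<String, String> responses = new ConcurrentHashMap<>();

    public static void main(String[] args) {
        // A polling round captures the map that is current when it starts.
        ConcurrentMap<String, String> capturedByOldRound = responses;

        // Polling is cancelled (e.g. a master was elected); the map is recreated so that
        // stale entries, such as one for a node that has left the cluster, are dropped.
        responses = new ConcurrentHashMap<>();

        // A response from the old round arrives late and is written via the captured reference.
        capturedByOldRound.put("nodeC", "stale cluster formation state");

        // The fresh map is unaffected, so later diagnosis never sees the stale entry.
        System.out.println("old map: " + capturedByOldRound); // {nodeC=stale cluster formation state}
        System.out.println("new map: " + responses);          // {}
    }
}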
        /**
         * This is a wrapper Cancellable. After polling begins, every time a new remote request is scheduled (about once every 10
         * seconds) we get a new Cancellable. This wraps all of them so that we only have to cancel the single Cancellable that is
         * initially returned from pollUntilCancelled() in order to cancel them all.
         */
        private final MultipleCancellablesWrapper multipleCancellablesWrapper;

        /**
         * This constructor is used to create the root task. It initializes the MultipleCancellablesWrapper that is shared between all
         * the related tasks.
         *
         * @param node The node to poll for cluster formation information
         * @param nodeToClusterFormationStateMap A reference to the global nodeToClusterFormationStateMap
         */
        PollClusterFormationStateTask(
            DiscoveryNode node,
            final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap
        ) {
            this(node, nodeToClusterFormationStateMap, new MultipleCancellablesWrapper());
        }

        private PollClusterFormationStateTask(
            DiscoveryNode node,
            final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap,
            MultipleCancellablesWrapper multipleCancellablesWrapper
        ) {
            this.node = node;
            this.nodeToClusterFormationStateMap = nodeToClusterFormationStateMap;
            this.multipleCancellablesWrapper = multipleCancellablesWrapper;
        }

        /**
         * This method returns a Cancellable quickly, but in the background it schedules a query of the remote node's cluster
         * formation state to run in 10 seconds, and repeats doing that until cancel() is called on the returned Cancellable.
         *
         * @return a Cancellable that, when cancelled, cancels all polling attempts scheduled for this node
         */
        public Scheduler.Cancellable pollUntilCancelled() {

Reviewer (on this method): It seems to me that

Whilst here, is there a need for

Maybe the signature I'm proposing is:

Author: Oh, good point about PollClusterFormationStateTask's existence. In an earlier incarnation this was an actual Runnable, and

Author: Here's the signature I have now:

Author: OK, the method now accepts a

Reviewer: I think this is a step in the right direction, thanks for implementing it!

Would this be an avenue for simplification? Say the method signature is as follows

This means the caller is responsible for re-calling

I think this would remove the need for the

Consuming the

This would reduce the scope of

What do you think?

Author: Unless I'm misunderstanding something, we can't use a

Author: Also, I think your proposal exposes us to the race condition here, right (although I'm not 100% sure that causes big problems)? #88397 (comment)

Reviewer: Ah, apologies, I did mean

I think it's a bit difficult to follow; that's why I suggested reducing the scope of

Author: OK, I was mistaken about where it was failing. As we discussed offline, the actual reason it was failing was because the code at #88397 (comment) would stop polling after 2 attempts. We're now doing something very similar to that, but recursive so that it continues polling until cancelled.

            StepListener<Releasable> connectionListener = new StepListener<>();
            StepListener<ClusterFormationInfoAction.Response> clusterFormationInfoResponseListener = new StepListener<>();

Reviewer (on the declarations above): Would this be more readable if named

            long startTime = System.nanoTime();
            connectionListener.whenComplete(releasable -> {
                logger.trace("Opened connection to {}, making cluster coordination info request", node);
                // If we don't get a response in 10 seconds that is a failure worth capturing on its own:
                final TimeValue transportTimeout = TimeValue.timeValueSeconds(10);
                transportService.sendRequest(
                    node,
                    ClusterFormationInfoAction.NAME,
                    new ClusterFormationInfoAction.Request(),
                    TransportRequestOptions.timeout(transportTimeout),
                    new ActionListenerResponseHandler<>(
                        ActionListener.runAfter(
                            ActionListener.runBefore(clusterFormationInfoResponseListener, () -> Releasables.close(releasable)),
                            () -> new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper)
                                .pollUntilCancelled()
                        ),
                        ClusterFormationInfoAction.Response::new
                    )
                );
            }, e -> {
                logger.warn("Exception connecting to master node", e);
                nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(e));
                /*
                 * Note: We can't call pollUntilCancelled() in a runAfter() in this case because when the corresponding
                 * onResponse() is called we actually aren't finished yet (because it makes another asynchronous request).
                 */
                new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper).pollUntilCancelled();
            });

Reviewer (on the ActionListener wrapping above): Could this be simplified? Even if just documenting the order of execution? i.e.

Hmm, actually is the

Could the last step (

Author: It could, but it would have to be done twice (once in the success path and once in the fail path). The nice thing about
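
Since the thread above asks about the order of execution, here is a rough, self-contained analogue with hand-rolled combinators (not the real ActionListener API, which also handles exceptions thrown by the wrapped runnables): on either the success or the failure path the intent is to close the connection first, then deliver the result to the response listener, then schedule the next poll.

// Illustrative only: a plain-Java stand-in showing the execution order of the wrapped listeners.
public class ListenerOrderSketch {

    interface Listener<T> {
        void onResponse(T response);
        void onFailure(Exception e);
    }

    // Runs `before` first, then delegates (mirrors the intent of a run-before wrapper).
    static <T> Listener<T> runBefore(Listener<T> delegate, Runnable before) {
        return new Listener<>() {
            public void onResponse(T response) { before.run(); delegate.onResponse(response); }
            public void onFailure(Exception e) { before.run(); delegate.onFailure(e); }
        };
    }

    // Delegates first, then runs `after` on either path (mirrors the intent of a run-after wrapper).
    static <T> Listener<T> runAfter(Listener<T> delegate, Runnable after) {
        return new Listener<>() {
            public void onResponse(T response) { try { delegate.onResponse(response); } finally { after.run(); } }
            public void onFailure(Exception e) { try { delegate.onFailure(e); } finally { after.run(); } }
        };
    }

    public static void main(String[] args) {
        Listener<String> responseListener = new Listener<>() {
            public void onResponse(String response) { System.out.println("2. record cluster formation state: " + response); }
            public void onFailure(Exception e) { System.out.println("2. record exception: " + e); }
        };
        Listener<String> chained = runAfter(
            runBefore(responseListener, () -> System.out.println("1. close connection")),
            () -> System.out.println("3. schedule next poll")
        );

        chained.onResponse("ok");                    // prints steps 1, 2, 3
        chained.onFailure(new Exception("timeout")); // same order on the failure path
    }
}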

            clusterFormationInfoResponseListener.whenComplete(response -> {
                long endTime = System.nanoTime();
                logger.trace("Received cluster coordination info from {} in {}", node, TimeValue.timeValueNanos(endTime - startTime));
                nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(response.getClusterFormationState()));
            }, e -> {
                logger.warn("Exception in cluster coordination info request to master node", e);
                nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(e));
            });

            Scheduler.ScheduledCancellable scheduledCancellable = transportService.getThreadPool().schedule(() -> {
                Version minSupportedVersion = Version.V_8_4_0;
                if (node.getVersion().onOrAfter(minSupportedVersion) == false) { // This was introduced in 8.4.0
                    logger.trace(
                        "Cannot get cluster coordination info for {} because it is at version {} and {} is required",
                        node,
                        node.getVersion(),
                        minSupportedVersion
                    );
                } else {
                    transportService.connectToNode(
                        // Note: This connection must be explicitly closed in the connectionListener
                        node,
                        ConnectionProfile.buildDefaultConnectionProfile(clusterService.getSettings()),
                        connectionListener
                    );
                }
            }, new TimeValue(10, TimeUnit.SECONDS), ThreadPool.Names.SAME);
            multipleCancellablesWrapper.addNewCancellable(scheduledCancellable);
            return multipleCancellablesWrapper;
        }
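
To illustrate the pattern the earlier review thread converged on ("recursive so that it continues polling until cancelled"), here is a minimal, self-contained sketch that uses a plain ScheduledExecutorService and an AtomicBoolean in place of the TransportService/Scheduler machinery; the names are illustrative and the 10-second delay simply mirrors the delay used above.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;

// Illustrative only: polls "a node" every 10 seconds, rescheduling itself after each
// attempt (success or failure) until cancel() flips the shared flag.
public class RecursivePollingSketch {
    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    private final AtomicBoolean cancelled = new AtomicBoolean(false);

    void pollUntilCancelled(String node, Consumer<String> resultConsumer) {
        if (cancelled.get()) {
            return; // stop the chain once cancelled
        }
        scheduler.schedule(() -> {
            try {
                resultConsumer.accept("cluster formation state from " + node); // stand-in for the remote request
            } catch (Exception e) {
                resultConsumer.accept("exception polling " + node + ": " + e);
            } finally {
                pollUntilCancelled(node, resultConsumer); // reschedule the next attempt
            }
        }, 10, TimeUnit.SECONDS);
    }

    void cancel() {
        cancelled.set(true);
        scheduler.shutdownNow(); // also drops any attempt that is already queued
    }

    public static void main(String[] args) throws InterruptedException {
        RecursivePollingSketch poller = new RecursivePollingSketch();
        poller.pollUntilCancelled("nodeA", System.out::println);
        Thread.sleep(25_000); // long enough for roughly two polls
        poller.cancel();
    }
}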
|
||
/** | ||
* This class represents a collection of related Cancellables. If one is cancelled, they are all considered cancelled. If cancel() | ||
* is called on this method, then cancel() is called on all child Cancellables. | ||
*/ | ||
static class MultipleCancellablesWrapper implements Scheduler.Cancellable { | ||
/* | ||
* This field will be read from and written to on multiple threads. CopyOnWriteArrayList is used here to avoid explicitly | ||
* synchronizing access and to avoid ConcurrentModificationExceptions when iterating through the delegates. | ||
*/ | ||
private final List<Scheduler.Cancellable> delegates = new CopyOnWriteArrayList<>(); | ||
|
||
@Override | ||
public boolean cancel() { | ||
delegates.forEach(Scheduler.Cancellable::cancel); | ||
return true; | ||
} | ||
|
||
@Override | ||
public boolean isCancelled() { | ||
return delegates.stream().anyMatch(Scheduler.Cancellable::isCancelled); | ||
} | ||
|
||
public void addNewCancellable(Scheduler.Cancellable cancellable) { | ||
delegates.add(cancellable); | ||
} | ||
} | ||
} | ||
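
As a usage illustration of the wrapper's contract described above (calling cancel() once cancels every delegate, and isCancelled() reports true if any delegate is cancelled), a stand-alone analogue might look like the following; the Cancellable interface here is a simplified stand-in for Scheduler.Cancellable, not the real API.

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;

// Illustrative only: a plain-Java analogue of the cancel-all / any-cancelled contract.
public class CancellableWrapperSketch {

    interface Cancellable {
        boolean cancel();
        boolean isCancelled();
    }

    static class SimpleCancellable implements Cancellable {
        private volatile boolean cancelled;
        public boolean cancel() { cancelled = true; return true; }
        public boolean isCancelled() { return cancelled; }
    }

    static class Wrapper implements Cancellable {
        private final List<Cancellable> delegates = new CopyOnWriteArrayList<>();
        void add(Cancellable cancellable) { delegates.add(cancellable); }
        public boolean cancel() { delegates.forEach(Cancellable::cancel); return true; }
        public boolean isCancelled() { return delegates.stream().anyMatch(Cancellable::isCancelled); }
    }

    public static void main(String[] args) {
        Wrapper wrapper = new Wrapper();
        SimpleCancellable first = new SimpleCancellable();
        SimpleCancellable second = new SimpleCancellable();
        wrapper.add(first);
        wrapper.add(second);

        System.out.println(wrapper.isCancelled()); // false: nothing cancelled yet
        wrapper.cancel();                          // cancels both delegates
        System.out.println(first.isCancelled());   // true
        System.out.println(second.isCancelled());  // true
    }
}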

    // Non-private for testing
    record ClusterFormationStateOrException(
        ClusterFormationFailureHelper.ClusterFormationState clusterFormationState,
        Exception exception
    ) {
        ClusterFormationStateOrException {
            if (clusterFormationState != null && exception != null) {
                throw new IllegalArgumentException("Cluster formation state and exception cannot both be non-null");
            }
        }

        ClusterFormationStateOrException(ClusterFormationFailureHelper.ClusterFormationState clusterFormationState) {
            this(clusterFormationState, null);
        }

        ClusterFormationStateOrException(Exception exception) {
            this(null, exception);
        }
    }

    public record CoordinationDiagnosticsResult(

Reviewer: IMO we shouldn't be constantly calling this cancel... when the cluster is healthy, but only when re-gaining a master. @DaveCTurner, what do you think?

Second reviewer: It's not a huge deal. Maybe we could use null instead of new ConcurrentHashMap<>() to distinguish "we're not polling" from "we're polling but have no entries"?

Author: My thinking was that it's basically a no-op (there won't be any scheduled tasks most of the time), and the minuscule performance hit was worth the risk of complicating the code and/or accidentally not calling it when it needed to be called. I can have cancelPollingClusterFormationInfo set clusterFormationResponses to null and have beginPollingClusterFormationInfo create a new ConcurrentHashMap<>() (I think that's what you're suggesting?) -- that would save a little garbage collection.
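
For reference, a minimal sketch of the alternative floated in this thread (illustrative only, not what the PR implements as of these commits): cancelPollingClusterFormationInfo sets the field to null to mean "not polling", and beginPollingClusterFormationInfo allocates a fresh map, so "polling with no entries yet" is distinguishable from "not polling at all".

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Illustrative only: field and method names loosely mirror the PR.
public class NullMeansNotPollingSketch {
    // null  -> we are not polling at all
    // empty -> we are polling but have received no responses yet
    private volatile ConcurrentMap<String, String> clusterFormationResponses = null;

    void beginPollingClusterFormationInfo() {
        clusterFormationResponses = new ConcurrentHashMap<>(); // allocate only when polling starts
        // ... schedule the polling tasks here ...
    }

    void cancelPollingClusterFormationInfo() {
        // ... cancel the polling tasks here ...
        clusterFormationResponses = null; // distinguishes "not polling" from "polling, no entries yet"
    }

    boolean isPolling() {
        return clusterFormationResponses != null;
    }
}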