Polling cluster formation state for master-is-stable health indicator #88397

Merged
Changes shown from 16 of 29 commits.

Commits:
85eaacf - Polling cluster formation state (masseyke, Jul 8, 2022)
85323a4 - cancelling all cancellables (masseyke, Jul 8, 2022)
dc3b581 - fixing compilation errors (masseyke, Jul 8, 2022)
5d215f3 - adding unit test (masseyke, Jul 8, 2022)
9cff8c7 - spotlessApply (masseyke, Jul 8, 2022)
6eabf6d - simplifying (masseyke, Jul 8, 2022)
3e38a50 - more cleanup (masseyke, Jul 8, 2022)
7ee8a03 - more cleanup (masseyke, Jul 8, 2022)
6605e87 - fixing ConcurrentModificationError (masseyke, Jul 8, 2022)
c5e7260 - breaking apart huge nested callbacks (masseyke, Jul 11, 2022)
27ebfd7 - using StepListener (masseyke, Jul 11, 2022)
e14af5b - Unit testing (masseyke, Jul 11, 2022)
919b5f7 - checkstyle (masseyke, Jul 11, 2022)
630a092 - cleaning up (masseyke, Jul 11, 2022)
f407efc - cleaning up (masseyke, Jul 11, 2022)
be04f87 - Update docs/changelog/88397.yaml (masseyke, Jul 11, 2022)
f760226 - code review feedback (masseyke, Jul 12, 2022)
4c127d9 - removing MultipleCancellablesWrapper (masseyke, Jul 12, 2022)
0168cfb - improving comments (masseyke, Jul 12, 2022)
a282a0d - code review feedback (masseyke, Jul 12, 2022)
b7937a9 - Merge branch 'master' into feature/polling-cluster-formation-state (elasticmachine, Jul 12, 2022)
9ef64e5 - cleaning up (masseyke, Jul 12, 2022)
69099bb - cleaning up (masseyke, Jul 12, 2022)
963279f - committing working code (masseyke, Jul 13, 2022)
c86f450 - committing working code (masseyke, Jul 13, 2022)
fb77e27 - code review feedback (masseyke, Jul 13, 2022)
8cea8a5 - cleanup (masseyke, Jul 13, 2022)
0144e60 - code review feedback (masseyke, Jul 14, 2022)
e7e8949 - Merge branch 'master' into feature/polling-cluster-formation-state (elasticmachine, Jul 14, 2022)
5 changes: 5 additions & 0 deletions docs/changelog/88397.yaml
@@ -0,0 +1,5 @@
pr: 88397
summary: Polling cluster formation state for master-is-stable health indicator
area: Health
type: enhancement
issues: []
server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java
@@ -10,6 +10,11 @@

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.admin.cluster.coordination.ClusterFormationInfoAction;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.node.DiscoveryNode;
@@ -19,7 +24,14 @@
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ConnectionProfile;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.io.PrintWriter;
@@ -30,6 +42,9 @@
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

@@ -47,6 +62,7 @@
*/
public class CoordinationDiagnosticsService implements ClusterStateListener {
private final ClusterService clusterService;
private final TransportService transportService;
private final Coordinator coordinator;
private final MasterHistoryService masterHistoryService;
/**
@@ -63,6 +79,19 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
*/
private final int unacceptableIdentityChanges;

/*
* This is a list of tasks that periodically reach out to other master-eligible nodes to get their ClusterFormationStates for
* diagnosis.
* This field is only ever accessed on the cluster change event thread, so there is no need to protect it for thread safety.
*/
private List<Scheduler.Cancellable> clusterFormationInfoTasks = List.of();
/*
* This field holds the results of the tasks in the clusterFormationInfoTasks field above. The field is accessed (reads/writes) from
* multiple threads, but the reference itself is only ever changed on the cluster change event thread.
*/
// Non-private for testing
volatile ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = new ConcurrentHashMap<>();

private static final Logger logger = LogManager.getLogger(CoordinationDiagnosticsService.class);

/**
@@ -98,10 +127,12 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {

public CoordinationDiagnosticsService(
ClusterService clusterService,
TransportService transportService,
Coordinator coordinator,
MasterHistoryService masterHistoryService
) {
this.clusterService = clusterService;
this.transportService = transportService;
this.coordinator = coordinator;
this.masterHistoryService = masterHistoryService;
this.nodeHasMasterLookupTimeframe = NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.get(clusterService.getSettings());
@@ -410,6 +441,204 @@ public void clusterChanged(ClusterChangedEvent event) {
}
}
}
if (currentMaster == null && clusterService.localNode().isMasterNode()) {
/*
* This begins polling all master-eligible nodes for cluster formation information. However there's a 10-second delay before it
* starts, so in the normal situation where a master transition flips from master1 -> null -> master2, the polling tasks
* will be canceled before any requests are actually made.
*/
beginPollingClusterFormationInfo();
} else {
cancelPollingClusterFormationInfo();
Contributor:
IMO we shouldn't be constantly calling this cancel... when the cluster is healthy, but only when re-gaining a master.
@DaveCTurner, what do you think?

Contributor:
It's not a huge deal. Maybe we could use null instead of new ConcurrentHashMap<>() to distinguish "we're not polling" from "we're polling but have no entries"?

Member Author:
My thinking was that it's basically a no-op (there won't be any scheduled tasks most of the time), and the minuscule performance hit seemed worth it to avoid the risk of complicating the code and/or accidentally not calling it when it needed to be called.
I can have cancelPollingClusterFormationInfo set clusterFormationResponses to null and have beginPollingClusterFormationInfo create a new ConcurrentHashMap<>() (I think that's what you're suggesting?) -- that would save a little garbage collection.
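
For illustration, a minimal sketch (plain Java, hypothetical names, not the PR's actual code) of the null-as-sentinel variant discussed here, where null means "not polling" and an empty map means "polling but no entries yet":

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    class NullSentinelSketch {
        // null means "not polling"; a non-null empty map means "polling, no results yet"
        private volatile ConcurrentMap<String, String> responses = null;

        void beginPolling() {
            responses = new ConcurrentHashMap<>(); // fresh map for this polling round
        }

        void cancelPolling() {
            responses = null; // no replacement map allocated while the cluster is healthy
        }

        void onResponse(String node, String result) {
            ConcurrentMap<String, String> local = responses; // single volatile read
            if (local != null) { // drop responses that arrive after cancellation
                local.put(node, result);
            }
        }
    }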

}
}
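
As an aside on the 10-second polling delay discussed in the code comment above, here is a runnable sketch (plain java.util.concurrent rather than Elasticsearch's Scheduler) of why a delayed task that is cancelled before its delay elapses never issues a request:

    import java.util.concurrent.Executors;
    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.ScheduledFuture;
    import java.util.concurrent.TimeUnit;

    class DelayedPollDemo {
        public static void main(String[] args) throws InterruptedException {
            ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
            ScheduledFuture<?> poll = scheduler.schedule(
                () -> System.out.println("polling a master-eligible node"),
                10, TimeUnit.SECONDS
            );
            Thread.sleep(100); // a new master is elected almost immediately...
            poll.cancel(false); // ...so the poll is cancelled before it ever fires
            System.out.println("cancelled before first poll: " + poll.isCancelled());
            scheduler.shutdown();
        }
    }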

private void beginPollingClusterFormationInfo() {
cancelPollingClusterFormationInfo();
clusterFormationInfoTasks = getMasterEligibleNodes().stream()
.map(masterNode -> beginPollingClusterFormationInfo(masterNode, clusterFormationResponses))
.collect(Collectors.toList());
}

private void cancelPollingClusterFormationInfo() {
clusterFormationInfoTasks.forEach(Scheduler.Cancellable::cancel);
/*
* Recreates the map so that we don't read old information or, worse, get stuck with information about a node that has been
* removed from the cluster.
*/
clusterFormationResponses = new ConcurrentHashMap<>();
}

// Non-private for testing
Scheduler.Cancellable beginPollingClusterFormationInfo(
DiscoveryNode node,
final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap
) {
return new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap).pollUntilCancelled();
}

/*
* This inner class wraps the logic of polling a master-eligible node for its cluster formation information (which is needed in the
* event that the cluster cannot elect a master node).
*/
// Non-private for testing
class PollClusterFormationStateTask {
/**
* The node that is being polled
*/
private final DiscoveryNode node;
/**
* This is a reference to the global nodeToClusterFormationStateMap that was current at the time this object was constructed. The
* global map is recreated whenever the task is cancelled. Having this reference prevents accidental writes to that map after
* cancellation.
Contributor:
This is a bit confusing, as it implies we're copying it (i.e. "that was current at the time this object was constructed") but also "references" it ... which means it's going to change in line with the global map.

I'm also not sure about the accidental writes to "that" map (it's "this" map, isn't it? :) ) nor what's accidental about the said writes.

Could we maybe talk about why we have this map here, as opposed to the implementation details of how it arrived here? (Apologies if I'm misunderstanding its purpose.)

Update: maybe we should drop PollClusterFormationStateTask altogether, in which case ignore the above :)

Member Author:
Below is a scenario showing why I'm copying the clusterFormationResponses reference and passing it in, rather than using the global clusterFormationResponses. Maybe I'm being unnecessarily cautious.

1. We begin polling master-eligible nodes A, B, C.
2. We create a new global clusterFormationResponses map.
3. We update the global clusterFormationResponses with results from A, B.
4. We cancel polling.
5. We begin polling master-eligible nodes A, B, D (note that it's a different set).
6. We create a new global clusterFormationResponses map.
7. We update the global clusterFormationResponses with results from C (the write was happening before cancel took effect, and I don't think it will interrupt a map insert, but maybe I'm wrong).
8. We update the global clusterFormationResponses with results from A, B, D.

So now we have results for A, B, C, and D, even though C is no longer part of the cluster. So we get weird results when we look to see if we can form a quorum, or if C knows about the existence of D. Or even if it's not as extreme as a node being replaced, we have results that might be out of date, causing confusion. My simple way to avoid worrying about all of this was to just pass the method a reference to the clusterFormationResponses map that it is supposed to be using.

*/
private final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap;
/**
* This is a wrapper Cancellable. After polling begins, every time a new remote request is scheduled (about once every 10
* seconds) we get a new Cancellable. This wraps all of them so that we only have to cancel the single Cancellable that is
* initially returned from pollUntilCancelled() in order to cancel them all.
*/
private final MultipleCancellablesWrapper multipleCancellablesWrapper;

/**
* This constructor is used to create the root task. It initializes the MultipleCancellablesWrapper that is shared between all
* the related tasks.
*
* @param node The node to poll for cluster formation information
* @param nodeToClusterFormationStateMap A reference to the global nodeToClusterFormationStateMap
*/
PollClusterFormationStateTask(
DiscoveryNode node,
final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap
) {
this(node, nodeToClusterFormationStateMap, new MultipleCancellablesWrapper());
}

private PollClusterFormationStateTask(
DiscoveryNode node,
final ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap,
MultipleCancellablesWrapper multipleCancellablesWrapper
) {
this.node = node;
this.nodeToClusterFormationStateMap = nodeToClusterFormationStateMap;
this.multipleCancellablesWrapper = multipleCancellablesWrapper;
}

/**
* This method returns a Cancellable quickly, but in the background it schedules a query of the remote node's cluster formation
* state in 10 seconds, and repeats doing that until cancel() is called on the returned Cancellable.
*
* @return a Cancellable that, when cancelled, stops this task and any follow-up tasks that it schedules
*/
public Scheduler.Cancellable pollUntilCancelled() {
Contributor:
It seems to me that untilCancelled in the method name here is not adding much information. Shall we just call it fetchClusterFormationInfo? It could also receive the node as a parameter.

Whilst here - is there a need for PollClusterFormationStateTask and MultipleCancellablesWrapper to exist? I think their wrapping of global state makes things a bit more difficult to understand.

Maybe fetchClusterFormationInfo could also receive a Consumer<ClusterFormationInfoAction.Response> and call that whenever it wants to pass the response to the outside world? Given it returns a Cancellable, the caller could aggregate them all in a list and cancel them without needing a MultipleCancellablesWrapper to aggregate them internally.

The signature I'm proposing is:

    Scheduler.Cancellable fetchClusterFormationInfo(DiscoveryNode node, Consumer<ClusterFormationInfoAction.Response> responseConsumer)

Member Author:
Oh, good point about PollClusterFormationStateTask's existence. In an earlier incarnation this was an actual Runnable, and pollUntilCancelled() was just run(). I improved that, but forgot to move the state out of PollClusterFormationStateTask and then get rid of it altogether. I think the signature will have to be a little different, though. More on that in an upcoming comment.

Member Author:
Here's the signature I have now:

    void fetchClusterFormationInfo(DiscoveryNode node, ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap, List<Scheduler.Cancellable> cancellables)

I can't just return a single Cancellable because one call to the method can generate lots of Cancellables (since the method schedules calls of itself). So I'm passing in the list of Cancellables that cancelPollingClusterFormationInfo acts on. This avoids having the MultipleCancellablesWrapper, but it effectively does the same thing. And I passed in a nodeToClusterFormationStateMap because I can't have just a ClusterFormationInfoAction.Response Consumer, since one possible outcome is an Exception rather than a Response. I'll see if making that a Consumer makes it more or less readable.

Member Author:
OK, the method now accepts a Consumer<ClusterFormationStateOrException> instead of a ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationStateMap. I think it helps out a little bit. The functionality is unchanged.

Contributor (@andreidan, Jul 13, 2022):
I think this is a step in the right direction, thanks for implementing it!

Quoting the reply above: "I can't just return a single Cancellable because one call to the method can generate lots of Cancellables (since the method schedules calls of itself)"

Would this be an avenue for simplification?

Say the method signature is as follows

Scheduler.Cancellable fetchClusterFormationInfo(DiscoveryNode node, Consumer<ClusterFormationInfoAction.Response> responseConsumer)

This means the caller is responsible for re-calling the fetchClusterFormationInfo method when the responseConsumer is called.

I think this would remove the need for the runAfter call in that runAfter(runBefore(...)) sequence, because it seems we'll want to reschedule irrespective of being able to get the response or not - so when responseConsumer is called we should be rescheduling.
I.e. we always call responseConsumer irrespective of how fetchClusterInfoListener completes:

                new ActionListenerResponseHandler<>(
                    ActionListener.runBefore(fetchClusterInfoListener, () -> Releasables.close(releasable)),
                    ClusterFormationInfoAction.Response::new
                )

...

fetchClusterInfoListener.whenComplete(response -> {
            long endTime = System.nanoTime();
            logger.trace("Received cluster coordination info from {} in {}", node, TimeValue.timeValueNanos(endTime - startTime));
            responseConsumer.accept(new ClusterFormationStateOrException(response.getClusterFormationState()));
        }, e -> {
            logger.warn("Exception in cluster coordination info request to master node", e);
            responseConsumer.accept(new ClusterFormationStateOrException(e));
        });

Consuming fetchClusterFormationInfo would look something along the lines of:

        getMasterEligibleNodes().forEach(masterNode -> {
            Consumer<ClusterFormationStateOrException> responseConsumer = response -> {
                if (clusterFormationResponses != null) {
                    clusterFormationResponses.put(masterNode, response);
                }
            };
            
            Scheduler.ScheduledCancellable scheduleFetch = fetchClusterFormationInfo(masterNode, responseConsumer.andThen(response -> {
                if (clusterFormationInfoTasks != null) {
                    // reschedule the fetch if it wasn't cancelled already
                    clusterFormationInfoTasks.add(fetchClusterFormationInfo(masterNode, responseConsumer));
                }
            }));
            
            clusterFormationInfoTasks.add(scheduleFetch);
        });

This would reduce the scope of fetchClusterFormationInfo and remove the need for a Consumer<Cancellable>.

What do you think?

Member Author:
Unless I'm misunderstanding something, we can't use a Consumer<ClusterFormationInfoAction.Response> here because then we'd have no way of handling exceptions, right? And we need to keep track of exceptions we encounter.
And even using a Consumer<ClusterFormationStateOrException>, I don't think the approach you're suggesting will work as-is, because it misses retries on a connectToNode exception (https://github.com/elastic/elasticsearch/pull/88397/files#diff-6898ba5666bbb351e267e9295e8feb7fbabf0d95d77aa4375c06cb2c16ad0a7dR529). I could probably add in a separate consumer for connectToNode, but at that point it's starting to get even more complex. Is there any use case that is missed by the current code?

Member Author:
Also, I think your proposal exposes us to the race condition described here, right (although I'm not 100% sure that causes big problems)? #88397 (comment)
But that is fixable.

Contributor (@andreidan, Jul 13, 2022):
Quoting the reply above: "And even using a Consumer I don't think we can take the approach you're suggesting will work as-is because it misses retries on a connectToNode exception (https://github.com/elastic/elasticsearch/pull/88397/files#diff-6898ba5666bbb351e267e9295e8feb7fbabf0d95d77aa4375c06cb2c16ad0a7dR529)"

Ah, apologies, I did mean Consumer<ClusterFormationStateOrException>. But if connectToNode raises an exception, the response consumer will still be called (https://github.com/elastic/elasticsearch/pull/88397/files#diff-6898ba5666bbb351e267e9295e8feb7fbabf0d95d77aa4375c06cb2c16ad0a7dR524), and the responseConsumer will track the exception and reschedule the fetch. Am I missing something?

Quoting again: "Is there any use case that is missed by the current code?"

I think it's a bit difficult to follow; that's why I suggested reducing the scope of fetch... and having a clear view over which callback reschedules the polling and when that happens. I'd argue that if the suggestion I proposed above works (I might be missing something), it'll be clear from the method signature that fetch... schedules one Cancellable task and has one consumer that gets a success or an exception (which could be chained to reschedule the fetch via andThen...).

Member Author:
OK, I was mistaken about where it was failing. As we discussed offline, the actual reason it was failing was that the code at #88397 (comment) would stop polling after 2 attempts. We're now doing something very similar to that, but recursive so that it continues polling until cancelled.
We also discussed offline that I think the code above could throw NullPointerExceptions, since clusterFormationInfoTasks and clusterFormationResponses could become null in between the null check and using them. Rather than synchronizing that code, I've put both of those references on the stack so that they're never null.
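
A small sketch (plain Java, hypothetical names) of the "references on the stack" pattern mentioned in this reply: a volatile field can be set to null by another thread between a null check and the subsequent use, so each task copies it once into a local variable:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    class StackReferenceSketch {
        private volatile ConcurrentMap<String, String> responses = new ConcurrentHashMap<>();

        void racy(String node, String value) {
            if (responses != null) {        // first read of the field...
                responses.put(node, value); // ...second read may see null -> NPE
            }
        }

        void safe(String node, String value) {
            ConcurrentMap<String, String> local = responses; // one read, kept on the stack
            if (local != null) {
                local.put(node, value); // local can never become null
            }
        }
    }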

StepListener<Releasable> connectionListener = new StepListener<>();
StepListener<ClusterFormationInfoAction.Response> clusterFormationInfoResponseListener = new StepListener<>();
Contributor:
Would this be more readable if named fetchClusterInfoListener? The Response bit is a bit redundant, maybe?

long startTime = System.nanoTime();
connectionListener.whenComplete(releasable -> {
logger.trace("Opened connection to {}, making cluster coordination info request", node);
// If we don't get a response in 10 seconds that is a failure worth capturing on its own:
final TimeValue transportTimeout = TimeValue.timeValueSeconds(10);
transportService.sendRequest(
node,
ClusterFormationInfoAction.NAME,
new ClusterFormationInfoAction.Request(),
TransportRequestOptions.timeout(transportTimeout),
new ActionListenerResponseHandler<>(
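// Completion order on either path (success or failure): first close the connection
// (runBefore), then complete clusterFormationInfoResponseListener, then schedule
// the next poll (runAfter).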
ActionListener.runAfter(
ActionListener.runBefore(clusterFormationInfoResponseListener, () -> Releasables.close(releasable)),
() -> new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper)
.pollUntilCancelled()
),
ClusterFormationInfoAction.Response::new
Contributor:

Could this be simplified?

Even if just documenting the order of execution?

i.e.

close(releasable) -> clusterFormationInfoResponseListener ->  () -> new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper).pollUntilCancelled()

Hmm, actually is the runAfter needed still?

Could the last step (() -> new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper).pollUntilCancelled() ) be executed in the clusterFormationInfoResponseListener.whenComplete callback?

Member Author:

Could the last step (...) be executed in the clusterFormationInfoResponseListener.whenComplete callback?

It could, but it would have to be done twice (once in the success path and once in the fail path). The nice thing about runAfter and runBefore is that they happen either way (success or failure). I can add in a comment documenting how that works though.
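
For illustration, a self-contained sketch (simplified listener type, not Elasticsearch's ActionListener API) of the property relied on here: the runBefore action fires before the wrapped listener and the runAfter action fires after it, on both the success path and the failure path:

    interface Listener<T> {
        void onResponse(T response);
        void onFailure(Exception e);

        static <T> Listener<T> runBefore(Listener<T> delegate, Runnable before) {
            return new Listener<>() {
                public void onResponse(T r) { before.run(); delegate.onResponse(r); }
                public void onFailure(Exception e) { before.run(); delegate.onFailure(e); }
            };
        }

        static <T> Listener<T> runAfter(Listener<T> delegate, Runnable after) {
            return new Listener<>() {
                public void onResponse(T r) { try { delegate.onResponse(r); } finally { after.run(); } }
                public void onFailure(Exception e) { try { delegate.onFailure(e); } finally { after.run(); } }
            };
        }
    }

    class ListenerOrderDemo {
        public static void main(String[] args) {
            Listener<String> base = new Listener<>() {
                public void onResponse(String r) { System.out.println("listener: " + r); }
                public void onFailure(Exception e) { System.out.println("listener failed: " + e.getMessage()); }
            };
            Listener<String> wrapped = Listener.runAfter(
                Listener.runBefore(base, () -> System.out.println("before: close connection")),
                () -> System.out.println("after: schedule next poll")
            );
            wrapped.onResponse("cluster formation state");
            wrapped.onFailure(new Exception("timeout")); // same before/after order on failure
        }
    }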

)
);
}, e -> {
logger.warn("Exception connecting to master node", e);
nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(e));
/*
* Note: We can't call pollUntilCancelled() in a runAfter() in this case because when the corresponding
* onResponse() is called we actually aren't finished yet (because it makes another asynchronous request).
*/
new PollClusterFormationStateTask(node, nodeToClusterFormationStateMap, multipleCancellablesWrapper).pollUntilCancelled();
});

clusterFormationInfoResponseListener.whenComplete(response -> {
long endTime = System.nanoTime();
logger.trace("Received cluster coordination info from {} in {}", node, TimeValue.timeValueNanos(endTime - startTime));
nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(response.getClusterFormationState()));
}, e -> {
logger.warn("Exception in cluster coordination info request to master node", e);
nodeToClusterFormationStateMap.put(node, new ClusterFormationStateOrException(e));
});

Scheduler.ScheduledCancellable scheduledCancellable = transportService.getThreadPool().schedule(() -> {
Version minSupportedVersion = Version.V_8_4_0;
if (node.getVersion().onOrAfter(minSupportedVersion) == false) { // This was introduced in 8.4.0
logger.trace(
"Cannot get cluster coordination info for {} because it is at version {} and {} is required",
node,
node.getVersion(),
minSupportedVersion
);
} else {
transportService.connectToNode(
// Note: This connection must be explicitly closed in the connectionListener
node,
ConnectionProfile.buildDefaultConnectionProfile(clusterService.getSettings()),
connectionListener
);
}
}, new TimeValue(10, TimeUnit.SECONDS), ThreadPool.Names.SAME);
multipleCancellablesWrapper.addNewCancellable(scheduledCancellable);
return multipleCancellablesWrapper;
}

/**
* This class represents a collection of related Cancellables. If one is cancelled, they are all considered cancelled. If cancel()
* is called on this object, then cancel() is called on all child Cancellables.
*/
static class MultipleCancellablesWrapper implements Scheduler.Cancellable {
/*
* This field will be read from and written to on multiple threads. CopyOnWriteArrayList is used here to avoid explicitly
* synchronizing access and to avoid ConcurrentModificationExceptions when iterating through the delegates.
*/
private final List<Scheduler.Cancellable> delegates = new CopyOnWriteArrayList<>();

@Override
public boolean cancel() {
delegates.forEach(Scheduler.Cancellable::cancel);
return true;
}

@Override
public boolean isCancelled() {
return delegates.stream().anyMatch(Scheduler.Cancellable::isCancelled);
}

public void addNewCancellable(Scheduler.Cancellable cancellable) {
delegates.add(cancellable);
}
}
}

// Non-private for testing
record ClusterFormationStateOrException(
ClusterFormationFailureHelper.ClusterFormationState clusterFormationState,
Exception exception
) {
ClusterFormationStateOrException {
if (clusterFormationState != null && exception != null) {
throw new IllegalArgumentException("Cluster formation state and exception cannot both be non-null");
}
}

ClusterFormationStateOrException(ClusterFormationFailureHelper.ClusterFormationState clusterFormationState) {
this(clusterFormationState, null);
}

ClusterFormationStateOrException(Exception exception) {
this(null, exception);
}
}
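
A self-contained analogue (hypothetical names) of the either/or contract this record enforces: the compact constructor rejects the state where both components are non-null, so consumers can branch on whichever side is present:

    import java.util.List;

    record ValueOrException<T>(T value, Exception exception) {
        ValueOrException {
            if (value != null && exception != null) {
                throw new IllegalArgumentException("value and exception cannot both be non-null");
            }
        }

        public static void main(String[] args) {
            ValueOrException<String> ok = new ValueOrException<>("cluster formed", null);
            ValueOrException<String> failed = new ValueOrException<>(null, new RuntimeException("request timed out"));
            for (ValueOrException<String> r : List.of(ok, failed)) {
                if (r.exception() != null) {
                    System.out.println("failure: " + r.exception().getMessage()); // exception side
                } else {
                    System.out.println("success: " + r.value()); // value side
                }
            }
        }
    }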

public record CoordinationDiagnosticsResult(
1 change: 1 addition & 0 deletions server/src/main/java/org/elasticsearch/node/Node.java
@@ -917,6 +917,7 @@ protected Node(
MasterHistoryService masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService);
CoordinationDiagnosticsService coordinationDiagnosticsService = new CoordinationDiagnosticsService(
clusterService,
transportService,
discoveryModule.getCoordinator(),
masterHistoryService
);