Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a dedicated threadpool for node connections #30150

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference/cat/thread_pool.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ management
ml_autodetect (default distro only)
ml_datafeed (default distro only)
ml_utility (default distro only)
node_connections
refresh
rollup_indexing (default distro only)
search
Expand Down
5 changes: 5 additions & 0 deletions docs/reference/modules/threadpool.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ There are several thread pools, but the important ones include:
Mainly for the Java client, to execute actions when listener threaded is set to true.
Thread pool type is `scaling` with a default max of `min(10, (# of available processors)/2)`.

`node_connections`::
For opening connections to other nodes in the cluster. Thread pool type is `scaling` with a
keep-alive of `5m` and a default max of `(# of available processors)*2`. A connection attempt can
take a long time to time out and blocks other attempts while it holds a thread, so on larger
clusters you may need to increase the `max` size manually.

Changing a specific thread pool can be done by setting its type-specific parameters; for example, changing the `bulk`
thread pool to have more threads:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,12 @@ public void connectToNodes(DiscoveryNodes discoveryNodes) {
if (connected) {
latch.countDown();
} else {
// spawn to another thread to do in parallel
threadPool.executor(ThreadPool.Names.MANAGEMENT).execute(new AbstractRunnable() {
threadPool.executor(ThreadPool.Names.NODE_CONNECTIONS).execute(new AbstractRunnable() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the ConnectionChecker below could also reasonably run on the NODE_CONNECTIONS threadpool.

@Override
public void onFailure(Exception e) {
// both errors and rejections are logged here. the service
// will try again after `cluster.nodes.reconnect_interval` on all nodes but the current master.
// On the master, node fault detection will remove these nodes from the cluster as their are not
// On the master, node fault detection will remove these nodes from the cluster as they are not
// connected. Note that it is very rare that we end up here on the master.
logger.warn(() -> new ParameterizedMessage("failed to connect to {}", node), e);
}
Expand Down Expand Up @@ -185,14 +184,14 @@ protected void doRun() {
/**
 * Re-arms the connection checker after a run has finished.
 *
 * The next run is scheduled only while the service lifecycle is still started.
 * It is scheduled exactly once, on the {@link ThreadPool.Names#NODE_CONNECTIONS} pool:
 * scheduling it a second time on another pool would start a duplicate checker chain
 * whose future is overwritten in {@code backgroundFuture} and can no longer be reached
 * through that field.
 */
@Override
public void onAfter() {
    if (lifecycle.started()) {
        backgroundFuture = threadPool.schedule(reconnectInterval, ThreadPool.Names.NODE_CONNECTIONS, this);
    }
}
}

/**
 * Starts the periodic connection check.
 *
 * Schedules a single {@code ConnectionChecker} to run after {@code reconnectInterval}
 * on the {@link ThreadPool.Names#NODE_CONNECTIONS} pool and keeps its handle in
 * {@code backgroundFuture}. Only one checker is scheduled so that the stored handle
 * always refers to the only outstanding run.
 */
@Override
protected void doStart() {
    backgroundFuture = threadPool.schedule(reconnectInterval, ThreadPool.Names.NODE_CONNECTIONS, new ConnectionChecker());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.util.Counter;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.Version;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.component.AbstractComponent;
Expand All @@ -38,6 +37,7 @@
import org.elasticsearch.common.util.concurrent.XRejectedExecutionHandler;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.node.Node;

import java.io.Closeable;
Expand Down Expand Up @@ -79,6 +79,7 @@ public static class Names {
public static final String FORCE_MERGE = "force_merge";
public static final String FETCH_SHARD_STARTED = "fetch_shard_started";
public static final String FETCH_SHARD_STORE = "fetch_shard_store";
public static final String NODE_CONNECTIONS = "node_connections";
}

public enum ThreadPoolType {
Expand Down Expand Up @@ -135,6 +136,7 @@ public static ThreadPoolType fromType(String type) {
map.put(Names.FORCE_MERGE, ThreadPoolType.FIXED);
map.put(Names.FETCH_SHARD_STARTED, ThreadPoolType.SCALING);
map.put(Names.FETCH_SHARD_STORE, ThreadPoolType.SCALING);
map.put(Names.NODE_CONNECTIONS, ThreadPoolType.SCALING);
THREAD_POOL_TYPES = Collections.unmodifiableMap(map);
}

Expand Down Expand Up @@ -186,6 +188,7 @@ public ThreadPool(final Settings settings, final ExecutorBuilder<?>... customBui
builders.put(Names.FETCH_SHARD_STARTED, new ScalingExecutorBuilder(Names.FETCH_SHARD_STARTED, 1, 2 * availableProcessors, TimeValue.timeValueMinutes(5)));
builders.put(Names.FORCE_MERGE, new FixedExecutorBuilder(settings, Names.FORCE_MERGE, 1, -1));
builders.put(Names.FETCH_SHARD_STORE, new ScalingExecutorBuilder(Names.FETCH_SHARD_STORE, 1, 2 * availableProcessors, TimeValue.timeValueMinutes(5)));
builders.put(Names.NODE_CONNECTIONS, new ScalingExecutorBuilder(Names.NODE_CONNECTIONS, 1, 2 * availableProcessors, TimeValue.timeValueMinutes(5)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels too small to me, for larger clusters. Connecting to a node is not a resource-intensive operation but it might take a long time to time out, blocking other connection attempts. Perhaps this is a reasonable default and we can note in the docs that larger clusters may prefer to increase this limit?

for (final ExecutorBuilder<?> builder : customBuilders) {
if (builders.containsKey(builder.name())) {
throw new IllegalArgumentException("builder with name [" + builder.name() + "] already exists");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.threadpool.TestThreadPool;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPoolStats;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.ConnectionProfile;
import org.elasticsearch.transport.Transport;
Expand All @@ -57,6 +58,7 @@
import java.util.concurrent.atomic.AtomicLong;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;

public class NodeConnectionsServiceTests extends ESTestCase {

Expand Down Expand Up @@ -103,6 +105,8 @@ public void testConnectAndDisconnect() {

service.disconnectFromNodesExcept(event.state().nodes());
assertConnectedExactlyToNodes(event.state());

assertUsingNodeConnectionThreadPool(threadPool);
}


Expand All @@ -129,6 +133,8 @@ public void testReconnect() {
transport.randomConnectionExceptions = false;
service.new ConnectionChecker().run();
assertConnectedExactlyToNodes(event.state());

assertUsingNodeConnectionThreadPool(threadPool);
}

private void assertConnectedExactlyToNodes(ClusterState state) {
Expand All @@ -148,6 +154,19 @@ private void assertNotConnected(Iterable<DiscoveryNode> nodes) {
}
}

/**
 * Asserts that the {@link ThreadPool.Names#NODE_CONNECTIONS} thread pool, and only that pool,
 * has completed tasks.
 *
 * Every pool reported by {@code threadPool.stats()} is checked: the node connections pool must
 * report a completed count above zero, and every other pool must report exactly zero.
 *
 * @param threadPool the thread pool whose per-pool statistics are inspected
 */
private void assertUsingNodeConnectionThreadPool(ThreadPool threadPool) {
    for (ThreadPoolStats.Stats stats : threadPool.stats()) {
        final String poolName = stats.getName();
        // name the pool in the failure message; the bare matcher output does not say which pool failed
        final String reason = "completed task count of thread pool [" + poolName + "]";
        // compare against 0L rather than casting the count to int, which could truncate it
        if (ThreadPool.Names.NODE_CONNECTIONS.equals(poolName)) {
            assertThat(reason, stats.getCompleted(), greaterThan(0L));
        } else {
            assertThat(reason, stats.getCompleted(), equalTo(0L));
        }
    }
}

@Override
@Before
public void setUp() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ private int expectedSize(final String threadPoolName, final int numberOfProcesso
sizes.put(ThreadPool.Names.SNAPSHOT, ThreadPool::halfNumberOfProcessorsMaxFive);
sizes.put(ThreadPool.Names.FETCH_SHARD_STARTED, ThreadPool::twiceNumberOfProcessors);
sizes.put(ThreadPool.Names.FETCH_SHARD_STORE, ThreadPool::twiceNumberOfProcessors);
sizes.put(ThreadPool.Names.NODE_CONNECTIONS, ThreadPool::twiceNumberOfProcessors);
return sizes.get(threadPoolName).apply(numberOfProcessors);
}

Expand Down