elastic · dakrone · Aug 31, 2017 · Jun 29, 2017 · Aug 10, 2017 · Aug 11, 2017
diff --git a/core/src/main/java/org/elasticsearch/action/search/SearchExecutionStatsCollector.java b/core/src/main/java/org/elasticsearch/action/search/SearchExecutionStatsCollector.java
@@ -61,7 +61,7 @@ public void onResponse(SearchPhaseResult response) {
             final int queueSize = queryResult.nodeQueueSize();
             final long responseDuration = System.nanoTime() - startNanos;
             // EWMA/queue size may be -1 if the query node doesn't support capturing it
-            if (serviceTimeEWMA > 0 && queueSize > 0) {
+            if (serviceTimeEWMA > 0 && queueSize >= 0) {
                 collector.addNodeStatistics(nodeId, queueSize, responseDuration, serviceTimeEWMA);
             }
         }

diff --git a/core/src/main/java/org/elasticsearch/action/search/SearchTransportService.java b/core/src/main/java/org/elasticsearch/action/search/SearchTransportService.java
@@ -57,6 +57,7 @@
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
+import java.util.Map;
 import java.util.function.BiFunction;
 import java.util.function.Supplier;
 
@@ -193,6 +194,13 @@ public RemoteClusterService getRemoteClusterService() {
         return transportService.getRemoteClusterService();
     }
 
+    /**
+     * Returns a map of nodeId to the number of pending requests for the given action name
+     */
+    public Map<String, Long> getPendingRequests(final String actionName) {
+        return transportService.getPendingRequests(actionName);
+    }
+
     static class ScrollFreeContextRequest extends TransportRequest {
         private long id;
 

diff --git a/core/src/main/java/org/elasticsearch/action/search/TransportSearchAction.java b/core/src/main/java/org/elasticsearch/action/search/TransportSearchAction.java
@@ -23,6 +23,7 @@
 import org.elasticsearch.action.OriginalIndices;
 import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsGroup;
 import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse;
+import org.elasticsearch.action.search.SearchAction;
 import org.elasticsearch.action.support.ActionFilters;
 import org.elasticsearch.action.support.HandledTransportAction;
 import org.elasticsearch.cluster.ClusterState;
@@ -284,8 +285,9 @@ private void executeSearch(SearchTask task, SearchTimeProvider timeProvider, Sea
         for (int i = 0; i < indices.length; i++) {
             concreteIndices[i] = indices[i].getName();
         }
+        Map<String, Long> nodeSearchCounts = searchTransportService.getPendingRequests(SearchAction.NAME);
         GroupShardsIterator<ShardIterator> localShardsIterator = clusterService.operationRouting().searchShards(clusterState,
-            concreteIndices, routingMap, searchRequest.preference());
+                concreteIndices, routingMap, searchRequest.preference(), searchService.getResponseCollectorService(), nodeSearchCounts);
         GroupShardsIterator<SearchShardIterator> shardIterators = mergeShardsIterators(localShardsIterator, localIndices,
             remoteShardIterators);
 

diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/IndexShardRoutingTable.java b/core/src/main/java/org/elasticsearch/cluster/routing/IndexShardRoutingTable.java
@@ -29,18 +29,24 @@
 import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.node.ResponseCollectorService;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Optional;
+import java.util.OptionalDouble;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 import static java.util.Collections.emptyMap;
 
@@ -261,6 +267,142 @@ public ShardIterator activeInitializingShardsIt(int seed) {
         return new PlainShardIterator(shardId, ordered);
     }
 
+    /**
+     * Returns an iterator over active and initializing shards, ordered by the adaptive replica
+     * selection formula. Making sure though that it's random within the active shards of the same
+     * (or missing) rank, and initializing shards are the last to iterate through.
+     */
+    public ShardIterator rankedActiveInitializingShardsIt(@Nullable ResponseCollectorService collector,
+                                                          @Nullable Map<String, Long> nodeSearchCounts) {
+        final int seed = shuffler.nextSeed();
+        if (allInitializingShards.isEmpty()) {
+            return new PlainShardIterator(shardId, rank(shuffler.shuffle(activeShards, seed), collector, nodeSearchCounts));
+        }
+
+        ArrayList<ShardRouting> ordered = new ArrayList<>(activeShards.size() + allInitializingShards.size());
+        List<ShardRouting> rankedActiveShards = rank(shuffler.shuffle(activeShards, seed), collector, nodeSearchCounts);
+        ordered.addAll(rankedActiveShards);
+        List<ShardRouting> rankedInitializingShards = rank(allInitializingShards, collector, nodeSearchCounts);
+        ordered.addAll(rankedInitializingShards);
+        return new PlainShardIterator(shardId, ordered);
+    }
+
+    private static Set<String> getAllNodeIds(final List<ShardRouting> shards) {
+        final Set<String> nodeIds = new HashSet<>();
+        for (ShardRouting shard : shards) {
+            nodeIds.add(shard.currentNodeId());
+        }
+        return nodeIds;
+    }
+
+    private static Map<String, Optional<ResponseCollectorService.ComputedNodeStats>>
+        getNodeStats(final Set<String> nodeIds, final ResponseCollectorService collector) {
+
+        final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats = new HashMap<>(nodeIds.size());
+        for (String nodeId : nodeIds) {
+            nodeStats.put(nodeId, collector.getNodeStatistics(nodeId));
+        }
+        return nodeStats;
+    }
+
+    private static Map<String, Double> rankNodes(final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats,
+                                                 final Map<String, Long> nodeSearchCounts) {
+        final Map<String, Double> nodeRanks = new HashMap<>(nodeStats.size());
+        for (Map.Entry<String, Optional<ResponseCollectorService.ComputedNodeStats>> entry : nodeStats.entrySet()) {
+            entry.getValue().ifPresent(stats -> {
+                nodeRanks.put(entry.getKey(), stats.rank(nodeSearchCounts.getOrDefault(entry.getKey(), 1L)));
+            });
+        }
+        return nodeRanks;
+    }
+
+    private static void adjustStats(final ResponseCollectorService collector,
+                                    final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats,
+                                    final String minNodeId,
+                                    final ResponseCollectorService.ComputedNodeStats minStats) {
+        if (minNodeId != null) {
+            for (Map.Entry<String, Optional<ResponseCollectorService.ComputedNodeStats>> entry : nodeStats.entrySet()) {
+                if (entry.getKey().equals(minNodeId) == false && entry.getValue().isPresent()) {
+                    final ResponseCollectorService.ComputedNodeStats stats = entry.getValue().get();
+                    final int updatedQueue = (minStats.queueSize + stats.queueSize) / 2;
+                    final long updatedResponse = (long) (minStats.responseTime + stats.responseTime) / 2;
+                    final long updatedService = (long) (minStats.serviceTime + stats.serviceTime) / 2;
+                    collector.addNodeStatistics(stats.nodeId, updatedQueue, updatedResponse, updatedService);
+                }
+            }
+        }
+    }
+
+    private static List<ShardRouting> rank(List<ShardRouting> shards, final ResponseCollectorService collector,
+                                           final Map<String, Long> nodeSearchCounts) {
+        if (collector == null || nodeSearchCounts == null || shards.size() <= 1) {
+            return shards;
+        }
+
+        // Retrieve which nodes we can potentially send the query to
+        final Set<String> nodeIds = getAllNodeIds(shards);
+        final int nodeCount = nodeIds.size();
+
+        final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats = getNodeStats(nodeIds, collector);
+
+        // Compute the rank of each node for which statistics are available
+        final Map<String, Double> nodeRanks = rankNodes(nodeStats, nodeSearchCounts);
+
+        String minNode = null;
+        ResponseCollectorService.ComputedNodeStats minStats = null;
+        // calculate the "winning" node and its stats (for adjusting other nodes later)
+        for (Map.Entry<String, Optional<ResponseCollectorService.ComputedNodeStats>> entry : nodeStats.entrySet()) {
+            if (entry.getValue().isPresent()) {
+                ResponseCollectorService.ComputedNodeStats stats = entry.getValue().get();
+                double rank = stats.rank(nodeSearchCounts.getOrDefault(entry.getKey(), 1L));
+                if (minStats == null || rank < minStats.rank(nodeSearchCounts.getOrDefault(minStats.nodeId, 1L))) {
+                    minStats = stats;
+                    minNode = entry.getKey();
+                }
+            }
+        }
+
+        // sort all shards based on the shard rank
+        ArrayList<ShardRouting> sortedShards = new ArrayList<>(shards);
+        Collections.sort(sortedShards, new NodeRankComparator(nodeRanks));
+
+        // adjust the non-winner nodes' stats so they will get a chance to receive queries
+        adjustStats(collector, nodeStats, minNode, minStats);
+
+        return sortedShards;
+    }
+
+    /**
+     * Orders shard copies by the rank of the node they are on, lowest rank first. Copies on
+     * nodes that have no computed rank sort before all ranked copies, so the comparison is a
+     * consistent total order as required by {@link Comparator}.
+     */
+    private static class NodeRankComparator implements Comparator<ShardRouting> {
+        private final Map<String, Double> nodeRanks;
+
+        NodeRankComparator(Map<String, Double> nodeRanks) {
+            this.nodeRanks = nodeRanks;
+        }
+
+        @Override
+        public int compare(ShardRouting s1, ShardRouting s2) {
+            if (s1.currentNodeId().equals(s2.currentNodeId())) {
+                // both copies are on the same node
+                return 0;
+            }
+            final Double shard1rank = nodeRanks.get(s1.currentNodeId());
+            final Double shard2rank = nodeRanks.get(s2.currentNodeId());
+            if (shard1rank == null || shard2rank == null) {
+                // Returning 0 whenever a rank is missing is not transitive (a < b, yet both
+                // "equal" an unranked c), so order by presence instead: unranked nodes first.
+                return Boolean.compare(shard1rank != null, shard2rank != null);
+            }
+            // Both nodes are ranked: the lower rank wins. Double.compare keeps this a total
+            // order even for unusual values such as NaN.
+            return Double.compare(shard1rank, shard2rank);
+        }
+    }
+
     /**
      * Returns true if no primaries are active or initializing for this shard
      */

diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/OperationRouting.java b/core/src/main/java/org/elasticsearch/cluster/routing/OperationRouting.java
@@ -28,10 +28,12 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexNotFoundException;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.shard.ShardNotFoundException;
+import org.elasticsearch.node.ResponseCollectorService;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -43,13 +45,24 @@
 
 public class OperationRouting extends AbstractComponent {
 
+    public static final Setting<Boolean> USE_ADAPTIVE_REPLICA_SELECTION_SETTING =
+            Setting.boolSetting("cluster.routing.use_adaptive_replica_selection", false,
+                    Setting.Property.Dynamic, Setting.Property.NodeScope);
+
     private String[] awarenessAttributes;
+    private boolean useAdaptiveReplicaSelection;
 
     public OperationRouting(Settings settings, ClusterSettings clusterSettings) {
         super(settings);
         this.awarenessAttributes = AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings);
+        this.useAdaptiveReplicaSelection = USE_ADAPTIVE_REPLICA_SELECTION_SETTING.get(settings);
         clusterSettings.addSettingsUpdateConsumer(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING,
             this::setAwarenessAttributes);
+        clusterSettings.addSettingsUpdateConsumer(USE_ADAPTIVE_REPLICA_SELECTION_SETTING, this::setUseAdaptiveReplicaSelection);
+    }
+
+    private void setUseAdaptiveReplicaSelection(boolean useAdaptiveReplicaSelection) {
+        this.useAdaptiveReplicaSelection = useAdaptiveReplicaSelection;
     }
 
     private void setAwarenessAttributes(String[] awarenessAttributes) {
@@ -61,19 +74,33 @@ public ShardIterator indexShards(ClusterState clusterState, String index, String
     }
 
     public ShardIterator getShards(ClusterState clusterState, String index, String id, @Nullable String routing, @Nullable String preference) {
-        return preferenceActiveShardIterator(shards(clusterState, index, id, routing), clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference);
+        return preferenceActiveShardIterator(shards(clusterState, index, id, routing), clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference, null, null);
     }
 
     public ShardIterator getShards(ClusterState clusterState, String index, int shardId, @Nullable String preference) {
         final IndexShardRoutingTable indexShard = clusterState.getRoutingTable().shardRoutingTable(index, shardId);
-        return preferenceActiveShardIterator(indexShard, clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference);
+        return preferenceActiveShardIterator(indexShard, clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference, null, null);
+    }
+
+    public GroupShardsIterator<ShardIterator> searchShards(ClusterState clusterState,
+                                                           String[] concreteIndices,
+                                                           @Nullable Map<String, Set<String>> routing,
+                                                           @Nullable String preference) {
+        return searchShards(clusterState, concreteIndices, routing, preference, null, null);
     }
 
-    public GroupShardsIterator<ShardIterator> searchShards(ClusterState clusterState, String[] concreteIndices, @Nullable Map<String, Set<String>> routing, @Nullable String preference) {
+
+    public GroupShardsIterator<ShardIterator> searchShards(ClusterState clusterState,
+                                                           String[] concreteIndices,
+                                                           @Nullable Map<String, Set<String>> routing,
+                                                           @Nullable String preference,
+                                                           @Nullable ResponseCollectorService collectorService,
+                                                           @Nullable Map<String, Long> nodeCounts) {
         final Set<IndexShardRoutingTable> shards = computeTargetedShards(clusterState, concreteIndices, routing);
         final Set<ShardIterator> set = new HashSet<>(shards.size());
         for (IndexShardRoutingTable shard : shards) {
-            ShardIterator iterator = preferenceActiveShardIterator(shard, clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference);
+            ShardIterator iterator = preferenceActiveShardIterator(shard,
+                    clusterState.nodes().getLocalNodeId(), clusterState.nodes(), preference, collectorService, nodeCounts);
             if (iterator != null) {
                 set.add(iterator);
             }
@@ -107,10 +134,17 @@ private Set<IndexShardRoutingTable> computeTargetedShards(ClusterState clusterSt
         return set;
     }
 
-    private ShardIterator preferenceActiveShardIterator(IndexShardRoutingTable indexShard, String localNodeId, DiscoveryNodes nodes, @Nullable String preference) {
+    private ShardIterator preferenceActiveShardIterator(IndexShardRoutingTable indexShard, String localNodeId,
+                                                        DiscoveryNodes nodes, @Nullable String preference,
+                                                        @Nullable ResponseCollectorService collectorService,
+                                                        @Nullable Map<String, Long> nodeCounts) {
         if (preference == null || preference.isEmpty()) {
             if (awarenessAttributes.length == 0) {
-                return indexShard.activeInitializingShardsRandomIt();
+                if (useAdaptiveReplicaSelection) {
+                    return indexShard.rankedActiveInitializingShardsIt(collectorService, nodeCounts);
+                } else {
+                    return indexShard.activeInitializingShardsRandomIt();
+                }
             } else {
                 return indexShard.preferAttributesActiveInitializingShardsIt(awarenessAttributes, nodes);
             }
@@ -141,7 +175,11 @@ private ShardIterator preferenceActiveShardIterator(IndexShardRoutingTable index
                 // no more preference
                 if (index == -1 || index == preference.length() - 1) {
                     if (awarenessAttributes.length == 0) {
-                        return indexShard.activeInitializingShardsRandomIt();
+                        if (useAdaptiveReplicaSelection) {
+                            return indexShard.rankedActiveInitializingShardsIt(collectorService, nodeCounts);
+                        } else {
+                            return indexShard.activeInitializingShardsRandomIt();
+                        }
                     } else {
                         return indexShard.preferAttributesActiveInitializingShardsIt(awarenessAttributes, nodes);
                     }

diff --git a/core/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/core/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -32,6 +32,7 @@
 import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
 import org.elasticsearch.cluster.metadata.MetaData;
+import org.elasticsearch.cluster.routing.OperationRouting;
 import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings;
 import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
@@ -402,6 +403,7 @@ public void apply(Settings value, Settings current, Settings previous) {
                     SearchModule.INDICES_MAX_CLAUSE_COUNT_SETTING,
                     ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING,
                     FastVectorHighlighter.SETTING_TV_HIGHLIGHT_MULTI_VALUE,
-                    Node.BREAKER_TYPE_KEY
+                    Node.BREAKER_TYPE_KEY,
+                    OperationRouting.USE_ADAPTIVE_REPLICA_SELECTION_SETTING
             )));
 }
diff --git a/core/src/main/java/org/elasticsearch/common/util/concurrent/EsExecutors.java b/core/src/main/java/org/elasticsearch/common/util/concurrent/EsExecutors.java
@@ -92,10 +92,6 @@ public static EsThreadPoolExecutor newFixed(String name, int size, int queueCapa
     public static EsThreadPoolExecutor newAutoQueueFixed(String name, int size, int initialQueueCapacity, int minQueueSize,
                                                          int maxQueueSize, int frameSize, TimeValue targetedResponseTime,
                                                          ThreadFactory threadFactory, ThreadContext contextHolder) {
-        if (initialQueueCapacity == minQueueSize && initialQueueCapacity == maxQueueSize) {
-            return newFixed(name, size, initialQueueCapacity, threadFactory, contextHolder);
-        }
-
         if (initialQueueCapacity <= 0) {
             throw new IllegalArgumentException("initial queue capacity for [" + name + "] executor must be positive, got: " +
                             initialQueueCapacity);

diff --git a/...main/java/org/elasticsearch/common/util/concurrent/QueueResizingEsThreadPoolExecutor.java b/...main/java/org/elasticsearch/common/util/concurrent/QueueResizingEsThreadPoolExecutor.java
@@ -79,9 +79,7 @@ public final class QueueResizingEsThreadPoolExecutor extends EsThreadPoolExecuto
         this.minQueueSize = minQueueSize;
         this.maxQueueSize = maxQueueSize;
         this.targetedResponseTimeNanos = targetedResponseTime.getNanos();
-        // We choose to start the EWMA with the targeted response time, reasoning that it is a
-        // better start point for a realistic task execution time than starting at 0
-        this.executionEWMA = new ExponentiallyWeightedMovingAverage(EWMA_ALPHA, targetedResponseTimeNanos);
+        this.executionEWMA = new ExponentiallyWeightedMovingAverage(EWMA_ALPHA, 0);
         logger.debug("thread pool [{}] will adjust queue by [{}] when determining automatic queue size",
                 name, QUEUE_ADJUSTMENT_AMOUNT);
     }