opensearch-project · VijayanB · Jun 11, 2024 · Jun 3, 2024 · Jun 6, 2024 · Jun 11, 2024
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Optimize parameter parsing in text chunking processor ([#733](https://github.com/opensearch-project/neural-search/pull/733))
 - Use lazy initialization for priority queue of hits and scores to improve latencies by 20% ([#746](https://github.com/opensearch-project/neural-search/pull/746))
 - Optimize max score calculation in the Query Phase of the Hybrid Search ([765](https://github.com/opensearch-project/neural-search/pull/765))
+- Implement parallel execution of sub-queries for hybrid search ([#749](https://github.com/opensearch-project/neural-search/pull/749))
 ### Bug Fixes
 - Total hit count fix in Hybrid Query ([756](https://github.com/opensearch-project/neural-search/pull/756))
 - Fix map type validation issue in multiple pipeline processors ([#661](https://github.com/opensearch-project/neural-search/pull/661))

@@ -0,0 +1,86 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.executors;
+
+import lombok.AccessLevel;
+import lombok.NoArgsConstructor;
+import lombok.experimental.PackagePrivate;
+import org.apache.lucene.search.TaskExecutor;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.common.util.concurrent.OpenSearchExecutors;
+import org.opensearch.threadpool.ExecutorBuilder;
+import org.opensearch.threadpool.FixedExecutorBuilder;
+import org.opensearch.threadpool.ThreadPool;
+
+/**
+ * {@link HybridQueryExecutor} provides the executor builder and the shared {@link TaskExecutor}
+ * that callers use to run sub-queries of a hybrid query in parallel as tasks. This ensures that one
+ * thread pool is used for hybrid query execution per node. The pool size is twice the allocated
+ * processor count, clamped to the range [2, 1000], since most operations of hybrid search are expected
+ * to be short-lived. This is intended to give good parallelization with reasonable throughput.
+ */
+@NoArgsConstructor(access = AccessLevel.PRIVATE)
+public final class HybridQueryExecutor {
+    private static final String HYBRID_QUERY_EXEC_THREAD_POOL_NAME = "_plugin_neural_search_hybrid_query_executor";
+    private static final int HYBRID_QUERY_EXEC_THREAD_POOL_QUEUE_SIZE = 1000;
+    private static final int MAX_THREAD_SIZE = 1000;
+    private static final int MIN_THREAD_SIZE = 2;
+    private static final int PROCESSOR_COUNT_MULTIPLIER = 2;
+    // volatile: assigned by initialize(...) and read by whichever threads call getExecutor()
+    private static volatile TaskExecutor taskExecutor;
+
+    /**
+     * Provide fixed executor builder to use for hybrid query executors
+     * @param settings Node level settings
+     * @return the executor builder for hybrid query's custom thread pool.
+     */
+    public static ExecutorBuilder<?> getExecutorBuilder(final Settings settings) {
+        final int numberOfThreads = getFixedNumberOfThreadSize(settings);
+        // the thread pool name is passed both as the pool name and as the builder's prefix argument
+        return new FixedExecutorBuilder(
+            settings,
+            HYBRID_QUERY_EXEC_THREAD_POOL_NAME,
+            numberOfThreads,
+            HYBRID_QUERY_EXEC_THREAD_POOL_QUEUE_SIZE,
+            HYBRID_QUERY_EXEC_THREAD_POOL_NAME
+        );
+    }
+
+    /**
+     * Initialize {@link TaskExecutor} to run tasks concurrently using {@link ThreadPool}
+     * @param threadPool OpenSearch's thread pool instance
+     * @throws IllegalArgumentException if threadPool is null
+     */
+    public static void initialize(ThreadPool threadPool) {
+        if (threadPool == null) {
+            throw new IllegalArgumentException(
+                "Argument thread-pool to Hybrid Query Executor cannot be null. This is required to build executor to run actions in parallel"
+            );
+        }
+        taskExecutor = new TaskExecutor(threadPool.executor(HYBRID_QUERY_EXEC_THREAD_POOL_NAME));
+    }
+
+    /**
+     * Return the TaskExecutor wrapper that helps run tasks concurrently. If {@link #initialize(ThreadPool)}
+     * has not been called yet, a new TaskExecutor that runs every task directly on the calling thread
+     * is returned instead.
+     * @return TaskExecutor instance to help run search tasks in parallel
+     */
+    public static TaskExecutor getExecutor() {
+        // read the shared field once so the null check and the returned value always agree
+        final TaskExecutor executor = taskExecutor;
+        return executor != null ? executor : new TaskExecutor(Runnable::run);
+    }
+
+    /**
+     * Return the name of the thread pool used for hybrid query execution
+     * @return thread pool name
+     */
+    @PackagePrivate
+    public static String getThreadPoolName() {
+        return HYBRID_QUERY_EXEC_THREAD_POOL_NAME;
+    }
+
+    /**
+     * Will use thread size as twice the default allocated processor. We selected twice allocated processor
+     * since hybrid query action is expected to be short-lived. This will balance throughput and latency.
+     * To avoid out of range, we will return 2 as minimum thread size and 1000 as maximum thread size
+     * @param settings Node level settings used to look up the allocated processor count
+     * @return number of threads, always within [2, 1000]
+     */
+    private static int getFixedNumberOfThreadSize(final Settings settings) {
+        final int allocatedProcessors = OpenSearchExecutors.allocatedProcessors(settings);
+        final int threadSize = Math.max(PROCESSOR_COUNT_MULTIPLIER * allocatedProcessors, MIN_THREAD_SIZE);
+        return Math.min(threadSize, MAX_THREAD_SIZE);
+    }
+}
@@ -0,0 +1,38 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.executors;
+
+import lombok.AccessLevel;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.Synchronized;
+
+import java.util.Optional;
+import java.util.function.Function;
+
+/**
+ * {@link HybridQueryExecutorCollector} is a generic collector used by Hybrid Search Query during the
+ * Query phase. Each instance holds one input and keeps the outcome of the latest action applied to it,
+ * so that sub query actions can be parallelized to improve latency.
+ *
+ * @param <I> type of the input handed to every action
+ * @param <R> type of the result produced by an action
+ */
+@RequiredArgsConstructor(staticName = "newCollector", access = AccessLevel.PACKAGE)
+public final class HybridQueryExecutorCollector<I, R> {
+
+    // input passed to every action given to collect; supplied through the newCollector factory
+    private final I param;
+
+    // stays empty until collect has run, so getResult is only meaningful after collect was invoked
+    @Getter(onMethod_ = { @Synchronized })
+    private Optional<R> result = Optional.empty();
+
+    /**
+     * Applies the given action to this collector's input and stores the outcome, replacing any
+     * previously stored result. A null outcome is stored as an empty result.
+     * @param action function that is executed with the collector's input; its return value becomes the result.
+     */
+    @Synchronized
+    public void collect(Function<I, R> action) {
+        final R outcome = action.apply(param);
+        result = Optional.ofNullable(outcome);
+    }
+}
@@ -0,0 +1,19 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.executors;
+
+/**
+ * {@link HybridQueryExecutorCollectorManager} is responsible for creating new {@link HybridQueryExecutorCollector} instances
+ *
+ * @param <C> concrete {@link HybridQueryExecutorCollector} type produced by this manager
+ */
+public interface HybridQueryExecutorCollectorManager<C extends HybridQueryExecutorCollector> {
+    /**
+     * Return a new collector instance of type {@link HybridQueryExecutorCollector}.
+     * This will be used during Hybrid Search when sub queries want to execute a part of
+     * an operation that is independent of the others and can therefore be parallelized
+     * to improve performance.
+     * @return a new HybridQueryExecutorCollector
+     */
+    C newCollector();
+}
@@ -0,0 +1,76 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.executors;
+
+import lombok.NonNull;
+import lombok.RequiredArgsConstructor;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * {@link HybridQueryRewriteCollectorManager} is responsible for creating {@link HybridQueryExecutorCollector}
+ * instances. Useful to create {@link HybridQueryExecutorCollector} instances that rewrites {@link Query} into primitive
+ * {@link Query} using {@link IndexSearcher}
+ */
+@RequiredArgsConstructor
+public final class HybridQueryRewriteCollectorManager implements HybridQueryExecutorCollectorManager<HybridQueryExecutorCollector> {
+
+    private @NonNull IndexSearcher searcher;
+
+    /**
+     * Returns new {@link HybridQueryExecutorCollector} to facilitate parallel execution
+     * @return HybridQueryExecutorCollector instance
+     */
+    @Override
+    public HybridQueryExecutorCollector<IndexSearcher, Map.Entry<Query, Boolean>> newCollector() {
+        return HybridQueryExecutorCollector.newCollector(searcher);
+    }
+
+    /**
+     * Returns list of {@link Query} that were rewritten by collectors. If collector doesn't
+     * have any result, null will be inserted to the result.
+     * This method must be called after collection is finished on all provided collectors.
+     * @param collectors list of collectors
+     * @return list of {@link Query} that was rewritten by corresponding collector from input.
+     */
+    public List<Query> getQueriesAfterRewrite(List<HybridQueryExecutorCollector<IndexSearcher, Map.Entry<Query, Boolean>>> collectors) {
+        List<Query> rewrittenQueries = new ArrayList<>();
+        for (HybridQueryExecutorCollector<IndexSearcher, Map.Entry<Query, Boolean>> collector : collectors) {
+            if (collector.getResult().isPresent()) {
+                rewrittenQueries.add(collector.getResult().get().getKey());
+            } else {
+                // if for some reason collector didn't have result, we will add null to its
+                // position in the result.
+                rewrittenQueries.add(null);
+            }
+        }
+        return rewrittenQueries;
+    }
+
+    /**
+     * Returns true if any of the {@link Query} from collector were actually rewritten.
+     * If any of the given collector doesn't have result, it will be ignored as if that
+     * instance did not exist. This method must be called after collection is finished
+     * on all provided collectors.
+     * @param collectors List of collectors to check any of their query was rewritten during
+     *                   collect step.
+     * @return at least one query is rewritten by any of the collectors
+     */
+    public boolean anyQueryRewrite(List<HybridQueryExecutorCollector<IndexSearcher, Map.Entry<Query, Boolean>>> collectors) {
+        // return true if at least one query is rewritten
+        for (HybridQueryExecutorCollector<IndexSearcher, Map.Entry<Query, Boolean>> collector : collectors) {
+            final Optional<Map.Entry<Query, Boolean>> result = collector.getResult();
+            if (result.isPresent() && result.get().getValue()) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
@@ -0,0 +1,58 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.executors;
+
+import lombok.NonNull;
+import lombok.RequiredArgsConstructor;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.ScorerSupplier;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+/**
+ * HybridQueryScoreSupplierCollectorManager is responsible for creating {@link HybridQueryExecutorCollector} instances.
+ * Useful to create {@link HybridQueryExecutorCollector} instances that build {@link ScorerSupplier} from
+ * given weight.
+ */
+@RequiredArgsConstructor
+public final class HybridQueryScoreSupplierCollectorManager
+    implements
+        HybridQueryExecutorCollectorManager<HybridQueryExecutorCollector<LeafReaderContext, ScorerSupplier>> {
+
+    // leaf context handed to every collector created by this manager; never reassigned
+    private final @NonNull LeafReaderContext context;
+
+    /**
+     * Creates new {@link HybridQueryExecutorCollector} instance everytime to facilitate parallel execution
+     * by individual tasks
+     * @return new instance of HybridQueryExecutorCollector
+     */
+    @Override
+    public HybridQueryExecutorCollector<LeafReaderContext, ScorerSupplier> newCollector() {
+        return HybridQueryExecutorCollector.newCollector(context);
+    }
+
+    /**
+     * mergeScoreSuppliers will build list of scoreSupplier from given list of collectors.
+     * If a collector didn't have any result, null will be added to the list at its position.
+     * This method must be called after collection is finished on all provided collectors.
+     * @param collectors List of collectors which is used to perform collection in parallel
+     * @return list of {@link ScorerSupplier}, in the same order as the given collectors
+     */
+    public List<ScorerSupplier> mergeScoreSuppliers(List<HybridQueryExecutorCollector<LeafReaderContext, ScorerSupplier>> collectors) {
+        final List<ScorerSupplier> scorerSuppliers = new ArrayList<>(collectors.size());
+        for (HybridQueryExecutorCollector<LeafReaderContext, ScorerSupplier> collector : collectors) {
+            // a missing result is kept as null so that positions line up with the input collectors
+            scorerSuppliers.add(collector.getResult().orElse(null));
+        }
+        return scorerSuppliers;
+    }
+}
@@ -18,13 +18,15 @@
 import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
 import org.opensearch.cluster.service.ClusterService;
 import org.opensearch.common.settings.Setting;
+import org.opensearch.common.settings.Settings;
 import org.opensearch.common.util.FeatureFlags;
 import org.opensearch.core.common.io.stream.NamedWriteableRegistry;
 import org.opensearch.core.xcontent.NamedXContentRegistry;
 import org.opensearch.env.Environment;
 import org.opensearch.env.NodeEnvironment;
 import org.opensearch.ingest.Processor;
 import org.opensearch.ml.client.MachineLearningNodeClient;
+import org.opensearch.neuralsearch.executors.HybridQueryExecutor;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 import org.opensearch.neuralsearch.processor.NeuralQueryEnricherProcessor;
 import org.opensearch.neuralsearch.processor.NeuralSparseTwoPhaseProcessor;
@@ -63,6 +65,7 @@
 import org.opensearch.search.pipeline.SearchRequestProcessor;
 import org.opensearch.search.pipeline.SearchResponseProcessor;
 import org.opensearch.search.query.QueryPhaseSearcher;
+import org.opensearch.threadpool.ExecutorBuilder;
 import org.opensearch.threadpool.ThreadPool;
 import org.opensearch.watcher.ResourceWatcherService;
 
@@ -95,6 +98,7 @@ public Collection<Object> createComponents(
         NeuralSearchClusterUtil.instance().initialize(clusterService);
         NeuralQueryBuilder.initialize(clientAccessor);
         NeuralSparseQueryBuilder.initialize(clientAccessor);
+        HybridQueryExecutor.initialize(threadPool);
         normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner());
         return List.of(clientAccessor);
     }
@@ -108,6 +112,11 @@ public List<QuerySpec<?>> getQueries() {
         );
     }
 
+    /**
+     * Supplies the executor builder for the hybrid query thread pool.
+     * @param settings settings passed on to {@link HybridQueryExecutor#getExecutorBuilder(Settings)}
+     * @return single-element list holding the hybrid query executor builder
+     */
+    @Override
+    public List<ExecutorBuilder<?>> getExecutorBuilders(Settings settings) {
+        final ExecutorBuilder<?> hybridQueryExecutorBuilder = HybridQueryExecutor.getExecutorBuilder(settings);
+        return List.of(hybridQueryExecutorBuilder);
+    }
+
     @Override
     public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
         clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client));