Address PR comments and change processor and query clause name. (#9)
* Address code review comments

Signed-off-by: zane-neo <[email protected]>

* Change lower case neural_sparse to upper case

Signed-off-by: zane-neo <[email protected]>

* Change back processor type name to sparse_encoding

Signed-off-by: zane-neo <[email protected]>

* Change names

Signed-off-by: zane-neo <[email protected]>

* Format code

Signed-off-by: zane-neo <[email protected]>

---------

Signed-off-by: zane-neo <[email protected]>
zane-neo authored Sep 28, 2023
1 parent 86b0d9c commit ddb5d69
Showing 13 changed files with 120 additions and 111 deletions.
CHANGELOG.md (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Features
Support sparse semantic retrieval by introducing `sparse_encoding` ingest processor and query builder ([#333](https://github.com/opensearch-project/neural-search/pull/333))
### Enhancements
- Add `max_token_score` parameter to improve the execution efficiency for `sparse_encoding` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
+ Add `max_token_score` parameter to improve the execution efficiency for `neural_sparse` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
### Bug Fixes
### Infrastructure
### Documentation
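For orientation, the sketch below shows what the renamed clause from this changelog entry might look like in a search request body. It is an illustrative example only: the index field name `passage_embedding` and the model id are placeholders rather than values from this PR, and the parameter names (`query_text`, `model_id`, `max_token_score`) come from the query builder changes further down in this diff.

```java
// Illustrative only: a neural_sparse clause with the max_token_score parameter.
// The field name and model id are placeholders.
public class NeuralSparseQueryBodyExample {
    // Request body for something like: GET <index>/_search
    static final String SEARCH_BODY = """
        {
          "query": {
            "neural_sparse": {
              "passage_embedding": {
                "query_text": "what is neural sparse search",
                "model_id": "<sparse-encoding-model-id>",
                "max_token_score": 3.5
              }
            }
          }
        }
        """;

    public static void main(String[] args) {
        System.out.println(SEARCH_BODY);
    }
}
```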
src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java
@@ -33,7 +33,7 @@
* build the query and add an upperbound to it.
*/

- package org.opensearch.neuralsearch.query;
+ package org.apache.lucene;

import java.io.IOException;
import java.util.Objects;
@@ -66,6 +66,10 @@
* To mitigate this issue, we rewrite the FeatureQuery to BoundedLinearFeatureQuery. The caller can
* set the token score upperbound of this query. And according to our use case, we use LinearFunction
* as the score function.
+  *
+  * This class combines both <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java">FeatureQuery</a>
+  * and <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureField.java">FeatureField</a> together
+  * and will be deprecated after OpenSearch upgraded lucene to version 9.8.
*/

public final class BoundedLinearFeatureQuery extends Query {
@@ -219,6 +223,7 @@ public String toString(String field) {
// the field and decodeFeatureValue are modified from FeatureField.decodeFeatureValue
static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;

[Codecov] Added line #L224 in src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java was not covered by tests.

// Rewriting this function to make scoreUpperBound work.
private float decodeFeatureValue(float freq) {
if (freq > MAX_FREQ) {
return scoreUpperBound;

[Codecov] Added line #L229 in src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java was not covered by tests.
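The hunk above only shows fragments of the bounded decoding logic, so here is a hedged, self-contained sketch of the underlying idea: FeatureField-style fields pack a float value into the top bits of the term frequency, and the bounded query clamps the decoded value at the configured upper bound before scoring it linearly. This is an illustration of the technique, not the plugin's implementation; the `Math.min` clamping and the `score` method are assumptions.

```java
// Illustrative sketch only (not the plugin code): clamp a FeatureField-style
// decoded value at scoreUpperBound, then score it linearly.
public final class BoundedLinearScoreSketch {

    // Same constant as in the hunk above: the largest encodable term frequency.
    static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;

    private final float scoreUpperBound; // e.g. the query's max_token_score

    public BoundedLinearScoreSketch(float scoreUpperBound) {
        this.scoreUpperBound = scoreUpperBound;
    }

    // FeatureField stores the top 16 bits of the float value as the term frequency;
    // shifting back recovers an approximation of the original value.
    float decodeFeatureValue(int freq) {
        if (freq > MAX_FREQ) {
            return scoreUpperBound;          // saturate instead of decoding out-of-range bits
        }
        float decoded = Float.intBitsToFloat(freq << 15);
        return Math.min(decoded, scoreUpperBound);
    }

    // Linear scoring: the decoded, clamped value scaled by the query-side weight.
    float score(float queryWeight, int freq) {
        return queryWeight * decodeFeatureValue(freq);
    }

    public static void main(String[] args) {
        BoundedLinearScoreSketch s = new BoundedLinearScoreSketch(3.5f);
        int freq = Float.floatToIntBits(7.2f) >>> 15;   // how FeatureField encodes 7.2
        System.out.println(s.score(1.0f, freq));        // clamped to 3.5 * 1.0
    }
}
```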
src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -41,7 +41,7 @@
import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizer;
import org.opensearch.neuralsearch.query.HybridQueryBuilder;
import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
- import org.opensearch.neuralsearch.query.SparseEncodingQueryBuilder;
+ import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;
import org.opensearch.neuralsearch.search.query.HybridQueryPhaseSearcher;
import org.opensearch.plugins.ActionPlugin;
import org.opensearch.plugins.ExtensiblePlugin;
@@ -81,7 +81,7 @@ public Collection<Object> createComponents(
final Supplier<RepositoriesService> repositoriesServiceSupplier
) {
NeuralQueryBuilder.initialize(clientAccessor);
- SparseEncodingQueryBuilder.initialize(clientAccessor);
+ NeuralSparseQueryBuilder.initialize(clientAccessor);

[Codecov] Added line #L84 in src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java was not covered by tests.
normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner());
return List.of(clientAccessor);
}
@@ -91,7 +91,7 @@ public List<QuerySpec<?>> getQueries() {
return Arrays.asList(
new QuerySpec<>(NeuralQueryBuilder.NAME, NeuralQueryBuilder::new, NeuralQueryBuilder::fromXContent),
new QuerySpec<>(HybridQueryBuilder.NAME, HybridQueryBuilder::new, HybridQueryBuilder::fromXContent),
- new QuerySpec<>(SparseEncodingQueryBuilder.NAME, SparseEncodingQueryBuilder::new, SparseEncodingQueryBuilder::fromXContent)
+ new QuerySpec<>(NeuralSparseQueryBuilder.NAME, NeuralSparseQueryBuilder::new, NeuralSparseQueryBuilder::fromXContent)
);
}

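With the spec above registered under `NeuralSparseQueryBuilder.NAME` and the ML client accessor initialized for it, the renamed builder can be used fluently from Java. The snippet below is a hypothetical usage sketch based on the chained accessors visible later in this diff; the field name and model id are placeholders.

```java
// Hypothetical usage sketch of the renamed builder; field name and model id are placeholders.
import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;

public class NeuralSparseBuilderExample {
    public static void main(String[] args) {
        NeuralSparseQueryBuilder query = new NeuralSparseQueryBuilder()
            .fieldName("passage_embedding")          // placeholder rank_features field
            .queryText("what is neural sparse search")
            .modelId("<sparse-encoding-model-id>")   // placeholder model id
            .maxTokenScore(3.5f);                    // optional upper bound per token score
        System.out.println(query);
    }
}
```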
NLPProcessor.java → InferenceProcessor.java
@@ -32,7 +32,7 @@
* and set the target fields according to the field name map.
*/
@Log4j2
- public abstract class NLPProcessor extends AbstractProcessor {
+ public abstract class InferenceProcessor extends AbstractProcessor {

public static final String MODEL_ID_FIELD = "model_id";
public static final String FIELD_MAP_FIELD = "field_map";
@@ -51,7 +51,7 @@ public abstract class NLPProcessor extends AbstractProcessor {

private final Environment environment;

- public NLPProcessor(
+ public InferenceProcessor(
String tag,
String description,
String type,
@@ -249,7 +249,7 @@ protected void setVectorFieldsToDocument(IngestDocument ingestDocument, Map<Stri
@SuppressWarnings({ "unchecked" })
@VisibleForTesting
Map<String, Object> buildNLPResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
- NLPProcessor.IndexWrapper indexWrapper = new NLPProcessor.IndexWrapper(0);
+ InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0);
Map<String, Object> result = new LinkedHashMap<>();
for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
String knnKey = knnMapEntry.getKey();
@@ -270,7 +270,7 @@ private void putNLPResultToSourceMapForMapType(
String processorKey,
Object sourceValue,
List<?> results,
- NLPProcessor.IndexWrapper indexWrapper,
+ InferenceProcessor.IndexWrapper indexWrapper,
Map<String, Object> sourceAndMetadataMap
) {
if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return;
@@ -294,7 +294,7 @@ private void putNLPResultToSourceMapForMapType(
private List<Map<String, Object>> buildNLPResultForListType(
List<String> sourceValue,
List<?> results,
- NLPProcessor.IndexWrapper indexWrapper
+ InferenceProcessor.IndexWrapper indexWrapper
) {
List<Map<String, Object>> keyToResult = new ArrayList<>();
IntStream.range(0, sourceValue.size())
SparseEncodingProcessor.java
@@ -22,7 +22,7 @@
* and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the sparse encoding results.
*/
@Log4j2
- public final class SparseEncodingProcessor extends NLPProcessor {
+ public final class SparseEncodingProcessor extends InferenceProcessor {

public static final String TYPE = "sparse_encoding";
public static final String LIST_TYPE_NESTED_MAP_KEY = "sparse_encoding";
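As a usage illustration for the `sparse_encoding` processor type retained by this PR, the sketch below defines an ingest pipeline whose `field_map` maps a source text field to the key that will hold the sparse encoding result. The pipeline name, field names, and model id are placeholders, not values from the repository.

```java
// Illustrative only: an ingest pipeline body using the sparse_encoding processor.
// All names below are placeholders.
public class SparseEncodingPipelineExample {
    // Body for something like: PUT _ingest/pipeline/sparse-encoding-pipeline
    static final String PIPELINE_BODY = """
        {
          "description": "Generate sparse token weights at ingest time",
          "processors": [
            {
              "sparse_encoding": {
                "model_id": "<sparse-encoding-model-id>",
                "field_map": {
                  "passage_text": "passage_embedding"
                }
              }
            }
          ]
        }
        """;

    public static void main(String[] args) {
        System.out.println(PIPELINE_BODY);
    }
}
```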
TextEmbeddingProcessor.java
@@ -21,7 +21,7 @@
* and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the text embedding results.
*/
@Log4j2
- public final class TextEmbeddingProcessor extends NLPProcessor {
+ public final class TextEmbeddingProcessor extends InferenceProcessor {

public static final String TYPE = "text_embedding";
public static final String LIST_TYPE_NESTED_MAP_KEY = "knn";
src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java (renamed from SparseEncodingQueryBuilder.java)
@@ -21,6 +21,7 @@
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
+ import org.apache.lucene.BoundedLinearFeatureQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
@@ -44,7 +45,7 @@
import com.google.common.annotations.VisibleForTesting;

/**
-  * SparseEncodingQueryBuilder is responsible for handling "sparse_encoding" query types. It uses an ML SPARSE_ENCODING model
+  * SparseEncodingQueryBuilder is responsible for handling "neural_sparse" query types. It uses an ML NEURAL_SPARSE model
* or SPARSE_TOKENIZE model to produce a Map with String keys and Float values for input text. Then it will be transformed
* to Lucene FeatureQuery wrapped by Lucene BooleanQuery.
*/
@@ -55,8 +56,8 @@
@Accessors(chain = true, fluent = true)
@NoArgsConstructor
@AllArgsConstructor
- public class SparseEncodingQueryBuilder extends AbstractQueryBuilder<SparseEncodingQueryBuilder> {
- public static final String NAME = "sparse_encoding";
+ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder<NeuralSparseQueryBuilder> {
+ public static final String NAME = "neural_sparse";
@VisibleForTesting
static final ParseField QUERY_TEXT_FIELD = new ParseField("query_text");
@VisibleForTesting
@@ -67,7 +68,7 @@ public class SparseEncodingQueryBuilder extends AbstractQueryBuilder<SparseEncod
private static MLCommonsClientAccessor ML_CLIENT;

public static void initialize(MLCommonsClientAccessor mlClient) {
- SparseEncodingQueryBuilder.ML_CLIENT = mlClient;
+ NeuralSparseQueryBuilder.ML_CLIENT = mlClient;
}

private String fieldName;
@@ -82,7 +83,7 @@ public static void initialize(MLCommonsClientAccessor mlClient) {
* @param in StreamInput to initialize object from
* @throws IOException thrown if unable to read from input stream
*/
- public SparseEncodingQueryBuilder(StreamInput in) throws IOException {
+ public NeuralSparseQueryBuilder(StreamInput in) throws IOException {
super(in);
this.fieldName = in.readString();
this.queryText = in.readString();
@@ -104,7 +105,7 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws
xContentBuilder.startObject(fieldName);
xContentBuilder.field(QUERY_TEXT_FIELD.getPreferredName(), queryText);
xContentBuilder.field(MODEL_ID_FIELD.getPreferredName(), modelId);
- if (null != maxTokenScore) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
+ if (maxTokenScore != null) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
printBoostAndQueryName(xContentBuilder);
xContentBuilder.endObject();
xContentBuilder.endObject();
@@ -122,8 +123,8 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws
* @return NeuralQueryBuilder
* @throws IOException can be thrown by parser
*/
- public static SparseEncodingQueryBuilder fromXContent(XContentParser parser) throws IOException {
- SparseEncodingQueryBuilder sparseEncodingQueryBuilder = new SparseEncodingQueryBuilder();
+ public static NeuralSparseQueryBuilder fromXContent(XContentParser parser) throws IOException {
+ NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder();
if (parser.currentToken() != XContentParser.Token.START_OBJECT) {
throw new ParsingException(parser.getTokenLocation(), "First token of " + NAME + "query must be START_OBJECT");
}
@@ -153,14 +154,14 @@ public static SparseEncodingQueryBuilder fromXContent(XContentParser parser) thr
sparseEncodingQueryBuilder.modelId(),
String.format(Locale.ROOT, "%s field must be provided for [%s] query", MODEL_ID_FIELD.getPreferredName(), NAME)
);
- if (null != sparseEncodingQueryBuilder.maxTokenScore && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
+ if (sparseEncodingQueryBuilder.maxTokenScore != null && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
throw new IllegalArgumentException(MAX_TOKEN_SCORE_FIELD.getPreferredName() + " must be larger than 0.");

[Codecov] Added line #L158 in src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java was not covered by tests.
}

return sparseEncodingQueryBuilder;
}

- private static void parseQueryParams(XContentParser parser, SparseEncodingQueryBuilder sparseEncodingQueryBuilder) throws IOException {
+ private static void parseQueryParams(XContentParser parser, NeuralSparseQueryBuilder sparseEncodingQueryBuilder) throws IOException {
XContentParser.Token token;
String currentFieldName = "";
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
@@ -212,7 +213,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) throws
}, actionListener::onFailure)
))
);
- return new SparseEncodingQueryBuilder().fieldName(fieldName)
+ return new NeuralSparseQueryBuilder().fieldName(fieldName)
.queryText(queryText)
.modelId(modelId)
.maxTokenScore(maxTokenScore)
@@ -227,7 +228,7 @@ protected Query doToQuery(QueryShardContext context) throws IOException {
Map<String, Float> queryTokens = queryTokensSupplier.get();
validateQueryTokens(queryTokens);

- final Float scoreUpperBound = null != maxTokenScore ? maxTokenScore : Float.MAX_VALUE;
+ final Float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;

BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
@@ -272,7 +273,7 @@ private static void validateQueryTokens(Map<String, Float> queryTokens) {
}

@Override
- protected boolean doEquals(SparseEncodingQueryBuilder obj) {
+ protected boolean doEquals(NeuralSparseQueryBuilder obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
EqualsBuilder equalsBuilder = new EqualsBuilder().append(fieldName, obj.fieldName)
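To make the truncated `doToQuery` hunk easier to follow, here is a hedged sketch of the Lucene query shape it builds: each (token, weight) entry produced by the sparse encoding model becomes one bounded feature clause on the target field. The `BoundedLinearFeatureQuery` constructor arguments, the `BoostQuery` wrapping, and the `SHOULD` occurrence are assumptions for illustration, not a quote of the PR's code.

```java
// Hedged sketch: turn model-produced (token, weight) pairs into a Lucene query.
// Constructor arguments and the BoostQuery/SHOULD wiring are assumptions.
import java.util.Map;

import org.apache.lucene.BoundedLinearFeatureQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;

public class NeuralSparseLuceneSketch {
    static Query toLuceneQuery(String fieldName, Map<String, Float> queryTokens, Float maxTokenScore) {
        // Fall back to an effectively unbounded score when max_token_score is not set.
        final float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
            // One clause per token: the stored feature score is capped at scoreUpperBound,
            // then scaled by the token weight returned by the sparse encoding model.
            Query tokenQuery = new BoundedLinearFeatureQuery(fieldName, entry.getKey(), scoreUpperBound);
            builder.add(new BoostQuery(tokenQuery, entry.getValue()), BooleanClause.Occur.SHOULD);
        }
        return builder.build();
    }

    public static void main(String[] args) {
        Query q = toLuceneQuery("passage_embedding", Map.of("neural", 1.2f, "sparse", 0.8f), 3.5f);
        System.out.println(q);
    }
}
```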
@@ -12,7 +12,7 @@
import java.util.stream.Collectors;

/**
-  * Utility class for working with sparse_encoding queries and ingest processor.
+  * Utility class for working with neural_sparse queries and ingest processor.
* Used to fetch the (token, weight) Map from the response returned by {@link org.opensearch.neuralsearch.ml.MLCommonsClientAccessor}
*
*/
(Diffs for the remaining changed files are not shown here.)

