Address PR comments and change processor and query clause name. (#9)
* Address code review comments

Signed-off-by: zane-neo <[email protected]>

* Change lower case neural_sparse to upper case

Signed-off-by: zane-neo <[email protected]>

* Change back processor type name to sparse_encoding

Signed-off-by: zane-neo <[email protected]>

* Change names

Signed-off-by: zane-neo <[email protected]>

* Format code

Signed-off-by: zane-neo <[email protected]>

---------

Signed-off-by: zane-neo <[email protected]>
zane-neo authored Sep 28, 2023
1 parent 86b0d9c commit ddb5d69
Showing 13 changed files with 120 additions and 111 deletions.
CHANGELOG.md (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Features
Support sparse semantic retrieval by introducing `sparse_encoding` ingest processor and query builder ([#333](https://github.com/opensearch-project/neural-search/pull/333))
### Enhancements
- Add `max_token_score` parameter to improve the execution efficiency for `sparse_encoding` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
+ Add `max_token_score` parameter to improve the execution efficiency for `neural_sparse` query clause ([#348](https://github.com/opensearch-project/neural-search/pull/348))
### Bug Fixes
### Infrastructure
### Documentation
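For orientation, the sketch below shows what the renamed clause from this changelog entry might look like in a search request body. It is an illustrative example only: the index field name `passage_embedding` and the model id are placeholders rather than values from this PR, and the parameter names (`query_text`, `model_id`, `max_token_score`) come from the query builder changes further down in this diff.

```java
// Illustrative only: a neural_sparse clause with the max_token_score parameter.
// The field name and model id are placeholders.
public class NeuralSparseQueryBodyExample {
    // Request body for something like: GET <index>/_search
    static final String SEARCH_BODY = """
        {
          "query": {
            "neural_sparse": {
              "passage_embedding": {
                "query_text": "what is neural sparse search",
                "model_id": "<sparse-encoding-model-id>",
                "max_token_score": 3.5
              }
            }
          }
        }
        """;

    public static void main(String[] args) {
        System.out.println(SEARCH_BODY);
    }
}
```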
src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java
@@ -33,7 +33,7 @@
* build the query and add an upperbound to it.
*/

- package org.opensearch.neuralsearch.query;
+ package org.apache.lucene;

import java.io.IOException;
import java.util.Objects;
@@ -66,6 +66,10 @@
* To mitigate this issue, we rewrite the FeatureQuery to BoundedLinearFeatureQuery. The caller can
* set the token score upperbound of this query. And according to our use case, we use LinearFunction
* as the score function.
+  *
+  * This class combines both <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java">FeatureQuery</a>
+  * and <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureField.java">FeatureField</a> together
+  * and will be deprecated after OpenSearch upgraded lucene to version 9.8.
*/

public final class BoundedLinearFeatureQuery extends Query {
@@ -219,6 +223,7 @@ public String toString(String field) {
// the field and decodeFeatureValue are modified from FeatureField.decodeFeatureValue
static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;

[Codecov] Added line #L224 in src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java was not covered by tests.

// Rewriting this function to make scoreUpperBound work.
private float decodeFeatureValue(float freq) {
if (freq > MAX_FREQ) {
return scoreUpperBound;

[Codecov] Added line #L229 in src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java was not covered by tests.
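The hunk above only shows fragments of the bounded decoding logic, so here is a hedged, self-contained sketch of the underlying idea: FeatureField-style fields pack a float value into the top bits of the term frequency, and the bounded query clamps the decoded value at the configured upper bound before scoring it linearly. This is an illustration of the technique, not the plugin's implementation; the `Math.min` clamping and the `score` method are assumptions.

```java
// Illustrative sketch only (not the plugin code): clamp a FeatureField-style
// decoded value at scoreUpperBound, then score it linearly.
public final class BoundedLinearScoreSketch {

    // Same constant as in the hunk above: the largest encodable term frequency.
    static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;

    private final float scoreUpperBound; // e.g. the query's max_token_score

    public BoundedLinearScoreSketch(float scoreUpperBound) {
        this.scoreUpperBound = scoreUpperBound;
    }

    // FeatureField stores the top 16 bits of the float value as the term frequency;
    // shifting back recovers an approximation of the original value.
    float decodeFeatureValue(int freq) {
        if (freq > MAX_FREQ) {
            return scoreUpperBound;          // saturate instead of decoding out-of-range bits
        }
        float decoded = Float.intBitsToFloat(freq << 15);
        return Math.min(decoded, scoreUpperBound);
    }

    // Linear scoring: the decoded, clamped value scaled by the query-side weight.
    float score(float queryWeight, int freq) {
        return queryWeight * decodeFeatureValue(freq);
    }

    public static void main(String[] args) {
        BoundedLinearScoreSketch s = new BoundedLinearScoreSketch(3.5f);
        int freq = Float.floatToIntBits(7.2f) >>> 15;   // how FeatureField encodes 7.2
        System.out.println(s.score(1.0f, freq));        // clamped to 3.5 * 1.0
    }
}
```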
src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -41,7 +41,7 @@
import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizer;
import org.opensearch.neuralsearch.query.HybridQueryBuilder;
import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
- import org.opensearch.neuralsearch.query.SparseEncodingQueryBuilder;
+ import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;
import org.opensearch.neuralsearch.search.query.HybridQueryPhaseSearcher;
import org.opensearch.plugins.ActionPlugin;
import org.opensearch.plugins.ExtensiblePlugin;
@@ -81,7 +81,7 @@ public Collection<Object> createComponents(
final Supplier<RepositoriesService> repositoriesServiceSupplier
) {
NeuralQueryBuilder.initialize(clientAccessor);
- SparseEncodingQueryBuilder.initialize(clientAccessor);
+ NeuralSparseQueryBuilder.initialize(clientAccessor);

[Codecov] Added line #L84 in src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java was not covered by tests.
normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner());
return List.of(clientAccessor);
}
@@ -91,7 +91,7 @@ public List<QuerySpec<?>> getQueries() {
return Arrays.asList(
new QuerySpec<>(NeuralQueryBuilder.NAME, NeuralQueryBuilder::new, NeuralQueryBuilder::fromXContent),
new QuerySpec<>(HybridQueryBuilder.NAME, HybridQueryBuilder::new, HybridQueryBuilder::fromXContent),
- new QuerySpec<>(SparseEncodingQueryBuilder.NAME, SparseEncodingQueryBuilder::new, SparseEncodingQueryBuilder::fromXContent)
+ new QuerySpec<>(NeuralSparseQueryBuilder.NAME, NeuralSparseQueryBuilder::new, NeuralSparseQueryBuilder::fromXContent)
);
}

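With the spec above registered under `NeuralSparseQueryBuilder.NAME` and the ML client accessor initialized for it, the renamed builder can be used fluently from Java. The snippet below is a hypothetical usage sketch based on the chained accessors visible later in this diff; the field name and model id are placeholders.

```java
// Hypothetical usage sketch of the renamed builder; field name and model id are placeholders.
import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;

public class NeuralSparseBuilderExample {
    public static void main(String[] args) {
        NeuralSparseQueryBuilder query = new NeuralSparseQueryBuilder()
            .fieldName("passage_embedding")          // placeholder rank_features field
            .queryText("what is neural sparse search")
            .modelId("<sparse-encoding-model-id>")   // placeholder model id
            .maxTokenScore(3.5f);                    // optional upper bound per token score
        System.out.println(query);
    }
}
```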
NLPProcessor.java → InferenceProcessor.java
@@ -32,7 +32,7 @@
* and set the target fields according to the field name map.
*/
@Log4j2
- public abstract class NLPProcessor extends AbstractProcessor {
+ public abstract class InferenceProcessor extends AbstractProcessor {

public static final String MODEL_ID_FIELD = "model_id";
public static final String FIELD_MAP_FIELD = "field_map";
@@ -51,7 +51,7 @@ public abstract class NLPProcessor extends AbstractProcessor {

private final Environment environment;

- public NLPProcessor(
+ public InferenceProcessor(
String tag,
String description,
String type,
@@ -249,7 +249,7 @@ protected void setVectorFieldsToDocument(IngestDocument ingestDocument, Map<Stri
@SuppressWarnings({ "unchecked" })
@VisibleForTesting
Map<String, Object> buildNLPResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
- NLPProcessor.IndexWrapper indexWrapper = new NLPProcessor.IndexWrapper(0);
+ InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0);
Map<String, Object> result = new LinkedHashMap<>();
for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
String knnKey = knnMapEntry.getKey();
@@ -270,7 +270,7 @@ private void putNLPResultToSourceMapForMapType(
String processorKey,
Object sourceValue,
List<?> results,
- NLPProcessor.IndexWrapper indexWrapper,
+ InferenceProcessor.IndexWrapper indexWrapper,
Map<String, Object> sourceAndMetadataMap
) {
if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return;
@@ -294,7 +294,7 @@ private void putNLPResultToSourceMapForMapType(
private List<Map<String, Object>> buildNLPResultForListType(
List<String> sourceValue,
List<?> results,
- NLPProcessor.IndexWrapper indexWrapper
+ InferenceProcessor.IndexWrapper indexWrapper
) {
List<Map<String, Object>> keyToResult = new ArrayList<>();
IntStream.range(0, sourceValue.size())
SparseEncodingProcessor.java
@@ -22,7 +22,7 @@
* and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the sparse encoding results.
*/
@Log4j2
- public final class SparseEncodingProcessor extends NLPProcessor {
+ public final class SparseEncodingProcessor extends InferenceProcessor {

public static final String TYPE = "sparse_encoding";
public static final String LIST_TYPE_NESTED_MAP_KEY = "sparse_encoding";
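As a usage illustration for the `sparse_encoding` processor type retained by this PR, the sketch below defines an ingest pipeline whose `field_map` maps a source text field to the key that will hold the sparse encoding result. The pipeline name, field names, and model id are placeholders, not values from the repository.

```java
// Illustrative only: an ingest pipeline body using the sparse_encoding processor.
// All names below are placeholders.
public class SparseEncodingPipelineExample {
    // Body for something like: PUT _ingest/pipeline/sparse-encoding-pipeline
    static final String PIPELINE_BODY = """
        {
          "description": "Generate sparse token weights at ingest time",
          "processors": [
            {
              "sparse_encoding": {
                "model_id": "<sparse-encoding-model-id>",
                "field_map": {
                  "passage_text": "passage_embedding"
                }
              }
            }
          ]
        }
        """;

    public static void main(String[] args) {
        System.out.println(PIPELINE_BODY);
    }
}
```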
TextEmbeddingProcessor.java
@@ -21,7 +21,7 @@
* and field_map can be used to indicate which fields needs text embedding and the corresponding keys for the text embedding results.
*/
@Log4j2
- public final class TextEmbeddingProcessor extends NLPProcessor {
+ public final class TextEmbeddingProcessor extends InferenceProcessor {

public static final String TYPE = "text_embedding";
public static final String LIST_TYPE_NESTED_MAP_KEY = "knn";
src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java (renamed from SparseEncodingQueryBuilder.java)
@@ -21,6 +21,7 @@
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
+ import org.apache.lucene.BoundedLinearFeatureQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
@@ -44,7 +45,7 @@
import com.google.common.annotations.VisibleForTesting;

/**
-  * SparseEncodingQueryBuilder is responsible for handling "sparse_encoding" query types. It uses an ML SPARSE_ENCODING model
+  * SparseEncodingQueryBuilder is responsible for handling "neural_sparse" query types. It uses an ML NEURAL_SPARSE model
* or SPARSE_TOKENIZE model to produce a Map with String keys and Float values for input text. Then it will be transformed
* to Lucene FeatureQuery wrapped by Lucene BooleanQuery.
*/
@@ -55,8 +56,8 @@
@Accessors(chain = true, fluent = true)
@NoArgsConstructor
@AllArgsConstructor
- public class SparseEncodingQueryBuilder extends AbstractQueryBuilder<SparseEncodingQueryBuilder> {
- public static final String NAME = "sparse_encoding";
+ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder<NeuralSparseQueryBuilder> {
+ public static final String NAME = "neural_sparse";
@VisibleForTesting
static final ParseField QUERY_TEXT_FIELD = new ParseField("query_text");
@VisibleForTesting
@@ -67,7 +68,7 @@ public class SparseEncodingQueryBuilder extends AbstractQueryBuilder<SparseEncod
private static MLCommonsClientAccessor ML_CLIENT;

public static void initialize(MLCommonsClientAccessor mlClient) {
- SparseEncodingQueryBuilder.ML_CLIENT = mlClient;
+ NeuralSparseQueryBuilder.ML_CLIENT = mlClient;
}

private String fieldName;
@@ -82,7 +83,7 @@ public static void initialize(MLCommonsClientAccessor mlClient) {
* @param in StreamInput to initialize object from
* @throws IOException thrown if unable to read from input stream
*/
- public SparseEncodingQueryBuilder(StreamInput in) throws IOException {
+ public NeuralSparseQueryBuilder(StreamInput in) throws IOException {
super(in);
this.fieldName = in.readString();
this.queryText = in.readString();
@@ -104,7 +105,7 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws
xContentBuilder.startObject(fieldName);
xContentBuilder.field(QUERY_TEXT_FIELD.getPreferredName(), queryText);
xContentBuilder.field(MODEL_ID_FIELD.getPreferredName(), modelId);
- if (null != maxTokenScore) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
+ if (maxTokenScore != null) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore);
printBoostAndQueryName(xContentBuilder);
xContentBuilder.endObject();
xContentBuilder.endObject();
@@ -122,8 +123,8 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws
* @return NeuralQueryBuilder
* @throws IOException can be thrown by parser
*/
- public static SparseEncodingQueryBuilder fromXContent(XContentParser parser) throws IOException {
- SparseEncodingQueryBuilder sparseEncodingQueryBuilder = new SparseEncodingQueryBuilder();
+ public static NeuralSparseQueryBuilder fromXContent(XContentParser parser) throws IOException {
+ NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder();
if (parser.currentToken() != XContentParser.Token.START_OBJECT) {
throw new ParsingException(parser.getTokenLocation(), "First token of " + NAME + "query must be START_OBJECT");
}
@@ -153,14 +154,14 @@ public static SparseEncodingQueryBuilder fromXContent(XContentParser parser) thr
sparseEncodingQueryBuilder.modelId(),
String.format(Locale.ROOT, "%s field must be provided for [%s] query", MODEL_ID_FIELD.getPreferredName(), NAME)
);
- if (null != sparseEncodingQueryBuilder.maxTokenScore && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
+ if (sparseEncodingQueryBuilder.maxTokenScore != null && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
throw new IllegalArgumentException(MAX_TOKEN_SCORE_FIELD.getPreferredName() + " must be larger than 0.");

[Codecov] Added line #L158 in src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java was not covered by tests.
}

return sparseEncodingQueryBuilder;
}

- private static void parseQueryParams(XContentParser parser, SparseEncodingQueryBuilder sparseEncodingQueryBuilder) throws IOException {
+ private static void parseQueryParams(XContentParser parser, NeuralSparseQueryBuilder sparseEncodingQueryBuilder) throws IOException {
XContentParser.Token token;
String currentFieldName = "";
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
@@ -212,7 +213,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) throws
}, actionListener::onFailure)
))
);
- return new SparseEncodingQueryBuilder().fieldName(fieldName)
+ return new NeuralSparseQueryBuilder().fieldName(fieldName)
.queryText(queryText)
.modelId(modelId)
.maxTokenScore(maxTokenScore)
@@ -227,7 +228,7 @@ protected Query doToQuery(QueryShardContext context) throws IOException {
Map<String, Float> queryTokens = queryTokensSupplier.get();
validateQueryTokens(queryTokens);

- final Float scoreUpperBound = null != maxTokenScore ? maxTokenScore : Float.MAX_VALUE;
+ final Float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;

BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
@@ -272,7 +273,7 @@ private static void validateQueryTokens(Map<String, Float> queryTokens) {
}

@Override
- protected boolean doEquals(SparseEncodingQueryBuilder obj) {
+ protected boolean doEquals(NeuralSparseQueryBuilder obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
EqualsBuilder equalsBuilder = new EqualsBuilder().append(fieldName, obj.fieldName)
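To make the truncated `doToQuery` hunk easier to follow, here is a hedged sketch of the Lucene query shape it builds: each (token, weight) entry produced by the sparse encoding model becomes one bounded feature clause on the target field. The `BoundedLinearFeatureQuery` constructor arguments, the `BoostQuery` wrapping, and the `SHOULD` occurrence are assumptions for illustration, not a quote of the PR's code.

```java
// Hedged sketch: turn model-produced (token, weight) pairs into a Lucene query.
// Constructor arguments and the BoostQuery/SHOULD wiring are assumptions.
import java.util.Map;

import org.apache.lucene.BoundedLinearFeatureQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;

public class NeuralSparseLuceneSketch {
    static Query toLuceneQuery(String fieldName, Map<String, Float> queryTokens, Float maxTokenScore) {
        // Fall back to an effectively unbounded score when max_token_score is not set.
        final float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
            // One clause per token: the stored feature score is capped at scoreUpperBound,
            // then scaled by the token weight returned by the sparse encoding model.
            Query tokenQuery = new BoundedLinearFeatureQuery(fieldName, entry.getKey(), scoreUpperBound);
            builder.add(new BoostQuery(tokenQuery, entry.getValue()), BooleanClause.Occur.SHOULD);
        }
        return builder.build();
    }

    public static void main(String[] args) {
        Query q = toLuceneQuery("passage_embedding", Map.of("neural", 1.2f, "sparse", 0.8f), 3.5f);
        System.out.println(q);
    }
}
```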
@@ -12,7 +12,7 @@
import java.util.stream.Collectors;

/**
-  * Utility class for working with sparse_encoding queries and ingest processor.
+  * Utility class for working with neural_sparse queries and ingest processor.
* Used to fetch the (token, weight) Map from the response returned by {@link org.opensearch.neuralsearch.ml.MLCommonsClientAccessor}
*
*/
(Diffs for the remaining changed files are not shown here.)

