From c5e5a293f076f76e3311926029583f20519ce6a1 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Tue, 19 Nov 2024 15:58:35 +0800 Subject: [PATCH] add it Signed-off-by: zhichao-aws --- .../processor/SparseEncodingProcessIT.java | 30 +++++++++++++++++++ ...ncodingPipelineConfigurationWithPrune.json | 21 +++++++++++++ .../UploadSparseEncodingModelRequestBody.json | 10 ++----- .../neuralsearch/BaseNeuralSearchIT.java | 7 +++-- 4 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 src/test/resources/processor/SparseEncodingPipelineConfigurationWithPrune.json diff --git a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java index 349da1033..83b680d19 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessIT.java @@ -18,6 +18,7 @@ import org.opensearch.neuralsearch.BaseNeuralSearchIT; import com.google.common.collect.ImmutableList; +import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder; public class SparseEncodingProcessIT extends BaseNeuralSearchIT { @@ -39,6 +40,35 @@ public void testSparseEncodingProcessor() throws Exception { createSparseEncodingIndex(); ingestDocument(); assertEquals(1, getDocCount(INDEX_NAME)); + + NeuralSparseQueryBuilder neuralSparseQueryBuilder = new NeuralSparseQueryBuilder(); + neuralSparseQueryBuilder.fieldName("title_sparse"); + neuralSparseQueryBuilder.queryTokensSupplier(() -> Map.of("good", 1.0f, "a", 2.0f)); + Map searchResponse = search(INDEX_NAME, neuralSparseQueryBuilder, 2); + assertFalse(searchResponse.isEmpty()); + double maxScore = (Double) ((Map) searchResponse.get("hits")).get("max_score"); + assertEquals(4.4433594, maxScore, 1e-3); + } finally { + wipeOfTestResources(INDEX_NAME, PIPELINE_NAME, modelId, null); + } + } + + public void testSparseEncodingProcessorWithPrune() throws Exception { + String modelId = null; + try { + modelId = prepareSparseEncodingModel(); + createPipelineProcessor(modelId, PIPELINE_NAME, ProcessorType.SPARSE_ENCODING_PRUNE); + createSparseEncodingIndex(); + ingestDocument(); + assertEquals(1, getDocCount(INDEX_NAME)); + + NeuralSparseQueryBuilder neuralSparseQueryBuilder = new NeuralSparseQueryBuilder(); + neuralSparseQueryBuilder.fieldName("title_sparse"); + neuralSparseQueryBuilder.queryTokensSupplier(() -> Map.of("good", 1.0f, "a", 2.0f)); + Map searchResponse = search(INDEX_NAME, neuralSparseQueryBuilder, 2); + assertFalse(searchResponse.isEmpty()); + double maxScore = (Double) ((Map) searchResponse.get("hits")).get("max_score"); + assertEquals(3.640625, maxScore, 1e-3); } finally { wipeOfTestResources(INDEX_NAME, PIPELINE_NAME, modelId, null); } diff --git a/src/test/resources/processor/SparseEncodingPipelineConfigurationWithPrune.json b/src/test/resources/processor/SparseEncodingPipelineConfigurationWithPrune.json new file mode 100644 index 000000000..642228e06 --- /dev/null +++ b/src/test/resources/processor/SparseEncodingPipelineConfigurationWithPrune.json @@ -0,0 +1,21 @@ +{ + "description": "An example sparse Encoding pipeline", + "processors" : [ + { + "sparse_encoding": { + "model_id": "%s", + "batch_size": "%d", + "prune_type": "max_ratio", + "prune_ratio": 0.8, + "field_map": { + "title": "title_sparse", + "favor_list": "favor_list_sparse", + "favorites": { + "game": "game_sparse", + "movie": "movie_sparse" + } + } + } + } + ] +} diff --git a/src/test/resources/processor/UploadSparseEncodingModelRequestBody.json b/src/test/resources/processor/UploadSparseEncodingModelRequestBody.json index 5c2a73f6e..6bdac87c5 100644 --- a/src/test/resources/processor/UploadSparseEncodingModelRequestBody.json +++ b/src/test/resources/processor/UploadSparseEncodingModelRequestBody.json @@ -1,10 +1,6 @@ { - "name": "tokenize-idf-0915", - "version": "1.0.0", - "function_name": "SPARSE_TOKENIZE", - "description": "test model", - "model_format": "TORCH_SCRIPT", + "name": "amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1", + "version": "1.0.1", "model_group_id": "%s", - "model_content_hash_value": "b345e9e943b62c405a8dd227ef2c46c84c5ff0a0b71b584be9132b37bce91a9a", - "url": "https://github.com/opensearch-project/ml-commons/raw/main/ml-algorithms/src/test/resources/org/opensearch/ml/engine/algorithms/sparse_encoding/sparse_demo.zip" + "model_format": "TORCH_SCRIPT" } diff --git a/src/testFixtures/java/org/opensearch/neuralsearch/BaseNeuralSearchIT.java b/src/testFixtures/java/org/opensearch/neuralsearch/BaseNeuralSearchIT.java index afc545447..4bfb9d8c0 100644 --- a/src/testFixtures/java/org/opensearch/neuralsearch/BaseNeuralSearchIT.java +++ b/src/testFixtures/java/org/opensearch/neuralsearch/BaseNeuralSearchIT.java @@ -85,7 +85,9 @@ public abstract class BaseNeuralSearchIT extends OpenSearchSecureRestTestCase { ProcessorType.TEXT_IMAGE_EMBEDDING, "processor/PipelineForTextImageEmbeddingProcessorConfiguration.json", ProcessorType.TEXT_EMBEDDING_WITH_NESTED_FIELDS_MAPPING, - "processor/PipelineConfigurationWithNestedFieldsMapping.json" + "processor/PipelineConfigurationWithNestedFieldsMapping.json", + ProcessorType.SPARSE_ENCODING_PRUNE, + "processor/SparseEncodingPipelineConfigurationWithPrune.json" ); private static final Set SUCCESS_STATUSES = Set.of(RestStatus.CREATED, RestStatus.OK); protected static final String CONCURRENT_SEGMENT_SEARCH_ENABLED = "search.concurrent_segment_search.enabled"; @@ -1439,6 +1441,7 @@ protected enum ProcessorType { TEXT_EMBEDDING, TEXT_EMBEDDING_WITH_NESTED_FIELDS_MAPPING, TEXT_IMAGE_EMBEDDING, - SPARSE_ENCODING + SPARSE_ENCODING, + SPARSE_ENCODING_PRUNE } }