Test: bwc test for text chunking processor (#661)
* bwc test for text chunking processor

Signed-off-by: yuye-aws <[email protected]>

* spotless apply

Signed-off-by: yuye-aws <[email protected]>

* update changelog

Signed-off-by: yuye-aws <[email protected]>

* spotless apply

Signed-off-by: yuye-aws <[email protected]>

* add test document for restart upgrade

Signed-off-by: yuye-aws <[email protected]>

* rename pipeline configuration file

Signed-off-by: yuye-aws <[email protected]>

* fix pipeline create bug

Signed-off-by: yuye-aws <[email protected]>

* fix pipeline create bug

Signed-off-by: yuye-aws <[email protected]>

* filter tests for lower versions

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index validate in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* filter bwc test for lower version

Signed-off-by: yuye-aws <[email protected]>

* bug fix in document ingestion in text chunking test

Signed-off-by: yuye-aws <[email protected]>

* ensure index creation in text chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* add comment

Signed-off-by: yuye-aws <[email protected]>

* update index setting

Signed-off-by: yuye-aws <[email protected]>

* update change log

Signed-off-by: yuye-aws <[email protected]>

* update gradle comment format

Signed-off-by: yuye-aws <[email protected]>

* update gradle file format

Signed-off-by: yuye-aws <[email protected]>

* rename bwc test filename

Signed-off-by: yuye-aws <[email protected]>

* update gradle file format

Signed-off-by: yuye-aws <[email protected]>

* update gradle file to filter tests

Signed-off-by: yuye-aws <[email protected]>

* merge method createPipelineProcessorWithoutModelId

Signed-off-by: yuye-aws <[email protected]>

* text chunking processor it: create pipeline method rename

Signed-off-by: yuye-aws <[email protected]>

* fix it failure

Signed-off-by: yuye-aws <[email protected]>

* include index mapping for text chunking index setting

Signed-off-by: yuye-aws <[email protected]>

* update nitpicking

Signed-off-by: yuye-aws <[email protected]>

---------

Signed-off-by: yuye-aws <[email protected]>
(cherry picked from commit e69752c)
yuye-aws committed Apr 23, 2024
1 parent 40782d8 commit c937a2a
Showing 15 changed files with 317 additions and 31 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
### Features
### Enhancements
- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
### Bug Fixes
18 changes: 16 additions & 2 deletions qa/restart-upgrade/build.gradle
@@ -65,7 +65,7 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -83,6 +83,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -107,7 +114,7 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.is_old_cluster', 'false'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -125,6 +132,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
@@ -99,4 +101,11 @@ protected void createPipelineForSparseEncodingProcessor(final String modelId, fi
);
createPipelineProcessor(requestBody, pipelineName, modelId);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
URL pipelineURLPath = classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json");
Objects.requireNonNull(pipelineURLPath);
String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
createPipelineProcessor(requestBody, pipelineName, "");
}
}
@@ -0,0 +1,75 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase {

private static final String PIPELINE_NAME = "pipeline-text-chunking";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
private static final String TEST_INGEST_TEXT =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);

// Test restart-upgrade text chunking processor
// Create Text Chunking Processor, Ingestion Pipeline and add document
// Validate processor, pipeline and document count in restart-upgrade scenario
public void testTextChunkingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
String indexName = getIndexNameForTest();
if (isRunningAgainstOldCluster()) {
createPipelineForTextChunkingProcessor(PIPELINE_NAME);
createChunkingIndex(indexName);
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages);
} else {
try {
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages);
} finally {
wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
}
}
}

private void createChunkingIndex(String indexName) throws Exception {
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
Objects.requireNonNull(documentURLPath);
String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
}

private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
int docCount = getDocCount(indexName);
assertEquals(documentCount, docCount);
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Map<String, Object> searchResults = search(indexName, query, 10);
assertNotNull(searchResults);
Map<String, Object> document = getFirstInnerHit(searchResults);
assertNotNull(document);
Object documentSource = document.get("_source");
assertTrue(documentSource instanceof Map);
@SuppressWarnings("unchecked")
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
assertTrue(documentSourceMap.containsKey(fieldName));
Object ingestOutputs = documentSourceMap.get(fieldName);
assertEquals(expected, ingestOutputs);
}
}
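The old/new branch above hinges on isRunningAgainstOldCluster(), which the restart-upgrade Gradle tasks drive through the tests.is_old_cluster system property (testAgainstNewCluster sets it to 'false' in qa/restart-upgrade/build.gradle, and the old-cluster task presumably sets it to 'true'). A minimal sketch of how such a flag could be derived, assuming the base class simply parses that property; the actual helper is defined outside this diff:

// Hypothetical sketch, not the actual base-class code: derive the
// restart-upgrade branch from the 'tests.is_old_cluster' system property
// that the tasks in qa/restart-upgrade/build.gradle set.
protected boolean isRunningAgainstOldCluster() {
    return Boolean.parseBoolean(System.getProperty("tests.is_old_cluster", "false"));
}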
@@ -0,0 +1,17 @@
{
"settings":{
"default_pipeline": "%s",
"number_of_shards": 3,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"body": {
"type": "text"
},
"body_chunk": {
"type": "text"
}
}
}
}
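The "%s" under default_pipeline above is a placeholder: createChunkingIndex reads this file verbatim and hands it to createIndexWithConfiguration together with PIPELINE_NAME. A plausible sketch of the substitution step, assuming a String.format-style helper — createIndexWithConfiguration itself is defined outside this diff, so the names here are illustrative:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;

// Hypothetical sketch: fill the '%s' default_pipeline placeholder in
// ChunkingIndexSettings.json before issuing the create-index request.
// Locale.ROOT keeps String.format locale-independent.
static String resolveIndexSettings(Path settingsFile, String pipelineName) throws Exception {
    String template = Files.readString(settingsFile);
    return String.format(Locale.ROOT, template, pipelineName);
}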
@@ -0,0 +1,18 @@
{
"description": "An example fixed token length chunker pipeline with standard tokenizer",
"processors" : [
{
"text_chunking": {
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "standard"
}
}
}
}
]
}
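With token_limit set to 10, the 24-token sample document ingests as chunks of 10, 10 and 4 tokens — exactly the three strings in expectedPassages, trailing whitespace included. A rough illustration of the splitting rule, assuming plain whitespace tokenization; the real processor runs Lucene's standard tokenizer and slices the original text by offset, so this approximation merely happens to agree on this sample:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Approximation of the fixed_token_length algorithm with token_limit = 10:
// split on single spaces, emit up to 10 tokens per chunk, and keep the
// trailing separator on non-final chunks (matching expectedPassages).
static List<String> chunkFixedTokenLength(String text, int tokenLimit) {
    String[] tokens = text.split(" ");
    List<String> chunks = new ArrayList<>();
    for (int start = 0; start < tokens.length; start += tokenLimit) {
        int end = Math.min(start + tokenLimit, tokens.length);
        String chunk = String.join(" ", Arrays.copyOfRange(tokens, start, end));
        chunks.add(end < tokens.length ? chunk + " " : chunk);
    }
    return chunks;
}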
30 changes: 29 additions & 1 deletion qa/rolling-upgrade/build.gradle
@@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -126,6 +133,13 @@ task testAgainstOneThirdUpgradedCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -150,7 +164,7 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -168,6 +182,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -210,6 +231,13 @@ task testRollingUpgrade(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
@@ -138,4 +140,11 @@ protected void updateClusterSettings() {
updateClusterSettings("plugins.ml_commons.native_memory_threshold", 100);
updateClusterSettings("plugins.ml_commons.allow_registering_model_via_url", true);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
URL pipelineURLPath = classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json");
Objects.requireNonNull(pipelineURLPath);
String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
createPipelineProcessor(requestBody, pipelineName, "");
}
}
@@ -36,7 +36,7 @@ public class NeuralSparseSearchIT extends AbstractRollingUpgradeTestCase {

// Test rolling-upgrade test sparse embedding processor
// Create Sparse Encoding Processor, Ingestion Pipeline and add document
// Validate process , pipeline and document count in restart-upgrade scenario
// Validate processor, pipeline and document count in rolling-upgrade scenario
public void testSparseEncodingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
switch (getClusterType()) {
@@ -0,0 +1,93 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase {

private static final String PIPELINE_NAME = "pipeline-text-chunking";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
private static final int NUM_DOCS_PER_ROUND = 1;
private static final String TEST_INGEST_TEXT =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);

// Test rolling-upgrade text chunking processor
// Create Text Chunking Processor, Ingestion Pipeline and add document
// Validate processor, pipeline and document count in rolling-upgrade scenario
public void testTextChunkingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
String indexName = getIndexNameForTest();
switch (getClusterType()) {
case OLD:
createPipelineForTextChunkingProcessor(PIPELINE_NAME);
createChunkingIndex(indexName);
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
break;
case MIXED:
int totalDocsCountMixed;
if (isFirstMixedRound()) {
totalDocsCountMixed = NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
} else {
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
}
break;
case UPGRADED:
try {
int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND;
addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages);
} finally {
wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
}
break;
default:
throw new IllegalStateException("Unexpected value: " + getClusterType());
}
}

private void createChunkingIndex(String indexName) throws Exception {
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
Objects.requireNonNull(documentURLPath);
String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
}

private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
int docCount = getDocCount(indexName);
assertEquals(documentCount, docCount);
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Map<String, Object> searchResults = search(indexName, query, 10);
assertNotNull(searchResults);
Map<String, Object> document = getFirstInnerHit(searchResults);
assertNotNull(document);
Object documentSource = document.get("_source");
assertTrue(documentSource instanceof Map);
@SuppressWarnings("unchecked")
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
assertTrue(documentSourceMap.containsKey(fieldName));
Object ingestOutputs = documentSourceMap.get(fieldName);
assertEquals(expected, ingestOutputs);
}
}