Test: bwc test for text chunking processor (#661)
* bwc test for text chunking processor

Signed-off-by: yuye-aws <[email protected]>

* spotless apply

Signed-off-by: yuye-aws <[email protected]>

* update changelog

Signed-off-by: yuye-aws <[email protected]>

* spotless apply

Signed-off-by: yuye-aws <[email protected]>

* add test document for restart upgrade

Signed-off-by: yuye-aws <[email protected]>

* rename pipeline configuration file

Signed-off-by: yuye-aws <[email protected]>

* fix pipeline create bug

Signed-off-by: yuye-aws <[email protected]>

* fix pipeline create bug

Signed-off-by: yuye-aws <[email protected]>

* filter tests for lower versions

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index create in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* index validate in chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* filter bwc test for lower version

Signed-off-by: yuye-aws <[email protected]>

* bug fix in document ingestion in text chunking test

Signed-off-by: yuye-aws <[email protected]>

* ensure index creation in text chunking bwc test

Signed-off-by: yuye-aws <[email protected]>

* add comment

Signed-off-by: yuye-aws <[email protected]>

* update index setting

Signed-off-by: yuye-aws <[email protected]>

* update change log

Signed-off-by: yuye-aws <[email protected]>

* update gradle comment format

Signed-off-by: yuye-aws <[email protected]>

* update gradle file format

Signed-off-by: yuye-aws <[email protected]>

* rename bwc test filename

Signed-off-by: yuye-aws <[email protected]>

* update gradle file format

Signed-off-by: yuye-aws <[email protected]>

* update gradle file to filter tests

Signed-off-by: yuye-aws <[email protected]>

* merge method createPipelineProcessorWithoutModelId

Signed-off-by: yuye-aws <[email protected]>

* text chunking processor it: create pipeline method rename

Signed-off-by: yuye-aws <[email protected]>

* fix it failure

Signed-off-by: yuye-aws <[email protected]>

* include index mapping for text chunking index setting

Signed-off-by: yuye-aws <[email protected]>

* update nitpicking

Signed-off-by: yuye-aws <[email protected]>

---------

Signed-off-by: yuye-aws <[email protected]>
(cherry picked from commit e69752c)
yuye-aws committed Apr 23, 2024
1 parent 40782d8 commit c937a2a
Showing 15 changed files with 317 additions and 31 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
### Features
### Enhancements
- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
### Bug Fixes
18 changes: 16 additions & 2 deletions qa/restart-upgrade/build.gradle
@@ -65,7 +65,7 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -83,6 +83,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -107,7 +114,7 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.is_old_cluster', 'false'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -125,6 +132,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
@@ -99,4 +101,11 @@ protected void createPipelineForSparseEncodingProcessor(final String modelId, fi
);
createPipelineProcessor(requestBody, pipelineName, modelId);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
URL pipelineURLPath = classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json");
Objects.requireNonNull(pipelineURLPath);
String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
createPipelineProcessor(requestBody, pipelineName, "");
}
}
@@ -0,0 +1,75 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase {

private static final String PIPELINE_NAME = "pipeline-text-chunking";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
private static final String TEST_INGEST_TEXT =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);

// Test restart-upgrade text chunking processor
// Create Text Chunking Processor, Ingestion Pipeline and add document
// Validate processor, pipeline and document count in restart-upgrade scenario
public void testTextChunkingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
String indexName = getIndexNameForTest();
if (isRunningAgainstOldCluster()) {
createPipelineForTextChunkingProcessor(PIPELINE_NAME);
createChunkingIndex(indexName);
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages);
} else {
try {
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages);
} finally {
wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
}
}
}

private void createChunkingIndex(String indexName) throws Exception {
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
Objects.requireNonNull(documentURLPath);
String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
}

private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
int docCount = getDocCount(indexName);
assertEquals(documentCount, docCount);
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Map<String, Object> searchResults = search(indexName, query, 10);
assertNotNull(searchResults);
Map<String, Object> document = getFirstInnerHit(searchResults);
assertNotNull(document);
Object documentSource = document.get("_source");
assertTrue(documentSource instanceof Map);
@SuppressWarnings("unchecked")
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
assertTrue(documentSourceMap.containsKey(fieldName));
Object ingestOutputs = documentSourceMap.get(fieldName);
assertEquals(expected, ingestOutputs);
}
}
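The old/new branch above hinges on isRunningAgainstOldCluster(), which the restart-upgrade Gradle tasks drive through the tests.is_old_cluster system property (testAgainstNewCluster sets it to 'false' in qa/restart-upgrade/build.gradle, and the old-cluster task presumably sets it to 'true'). A minimal sketch of how such a flag could be derived, assuming the base class simply parses that property; the actual helper is defined outside this diff:

// Hypothetical sketch, not the actual base-class code: derive the
// restart-upgrade branch from the 'tests.is_old_cluster' system property
// that the tasks in qa/restart-upgrade/build.gradle set.
protected boolean isRunningAgainstOldCluster() {
    return Boolean.parseBoolean(System.getProperty("tests.is_old_cluster", "false"));
}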
@@ -0,0 +1,17 @@
{
"settings":{
"default_pipeline": "%s",
"number_of_shards": 3,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"body": {
"type": "text"
},
"body_chunk": {
"type": "text"
}
}
}
}
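The "%s" under default_pipeline above is a placeholder: createChunkingIndex reads this file verbatim and hands it to createIndexWithConfiguration together with PIPELINE_NAME. A plausible sketch of the substitution step, assuming a String.format-style helper — createIndexWithConfiguration itself is defined outside this diff, so the names here are illustrative:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;

// Hypothetical sketch: fill the '%s' default_pipeline placeholder in
// ChunkingIndexSettings.json before issuing the create-index request.
// Locale.ROOT keeps String.format locale-independent.
static String resolveIndexSettings(Path settingsFile, String pipelineName) throws Exception {
    String template = Files.readString(settingsFile);
    return String.format(Locale.ROOT, template, pipelineName);
}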
@@ -0,0 +1,18 @@
{
"description": "An example fixed token length chunker pipeline with standard tokenizer",
"processors" : [
{
"text_chunking": {
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "standard"
}
}
}
}
]
}
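With token_limit set to 10, the 24-token sample document ingests as chunks of 10, 10 and 4 tokens — exactly the three strings in expectedPassages, trailing whitespace included. A rough illustration of the splitting rule, assuming plain whitespace tokenization; the real processor runs Lucene's standard tokenizer and slices the original text by offset, so this approximation merely happens to agree on this sample:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Approximation of the fixed_token_length algorithm with token_limit = 10:
// split on single spaces, emit up to 10 tokens per chunk, and keep the
// trailing separator on non-final chunks (matching expectedPassages).
static List<String> chunkFixedTokenLength(String text, int tokenLimit) {
    String[] tokens = text.split(" ");
    List<String> chunks = new ArrayList<>();
    for (int start = 0; start < tokens.length; start += tokenLimit) {
        int end = Math.min(start + tokenLimit, tokens.length);
        String chunk = String.join(" ", Arrays.copyOfRange(tokens, start, end));
        chunks.add(end < tokens.length ? chunk + " " : chunk);
    }
    return chunks;
}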
30 changes: 29 additions & 1 deletion qa/rolling-upgrade/build.gradle
@@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -126,6 +133,13 @@ task testAgainstOneThirdUpgradedCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -150,7 +164,7 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
@@ -168,6 +182,13 @@ }
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -210,6 +231,13 @@ task testRollingUpgrade(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because this feature was introduced in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
@@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
@@ -138,4 +140,11 @@ protected void updateClusterSettings() {
updateClusterSettings("plugins.ml_commons.native_memory_threshold", 100);
updateClusterSettings("plugins.ml_commons.allow_registering_model_via_url", true);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
URL pipelineURLPath = classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json");
Objects.requireNonNull(pipelineURLPath);
String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
createPipelineProcessor(requestBody, pipelineName, "");
}
}
@@ -36,7 +36,7 @@ public class NeuralSparseSearchIT extends AbstractRollingUpgradeTestCase {

// Test rolling-upgrade test sparse embedding processor
// Create Sparse Encoding Processor, Ingestion Pipeline and add document
// Validate process , pipeline and document count in restart-upgrade scenario
// Validate processor, pipeline and document count in rolling-upgrade scenario
public void testSparseEncodingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
switch (getClusterType()) {
@@ -0,0 +1,93 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase {

private static final String PIPELINE_NAME = "pipeline-text-chunking";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
private static final int NUM_DOCS_PER_ROUND = 1;
private static final String TEST_INGEST_TEXT =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);

// Test rolling-upgrade text chunking processor
// Create Text Chunking Processor, Ingestion Pipeline and add document
// Validate processor, pipeline and document count in rolling-upgrade scenario
public void testTextChunkingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
String indexName = getIndexNameForTest();
switch (getClusterType()) {
case OLD:
createPipelineForTextChunkingProcessor(PIPELINE_NAME);
createChunkingIndex(indexName);
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
break;
case MIXED:
int totalDocsCountMixed;
if (isFirstMixedRound()) {
totalDocsCountMixed = NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
} else {
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
}
break;
case UPGRADED:
try {
int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND;
addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages);
} finally {
wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
}
break;
default:
throw new IllegalStateException("Unexpected value: " + getClusterType());
}
}

private void createChunkingIndex(String indexName) throws Exception {
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
Objects.requireNonNull(documentURLPath);
String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
}

private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
int docCount = getDocCount(indexName);
assertEquals(documentCount, docCount);
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Map<String, Object> searchResults = search(indexName, query, 10);
assertNotNull(searchResults);
Map<String, Object> document = getFirstInnerHit(searchResults);
assertNotNull(document);
Object documentSource = document.get("_source");
assertTrue(documentSource instanceof Map);
@SuppressWarnings("unchecked")
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
assertTrue(documentSourceMap.containsKey(fieldName));
Object ingestOutputs = documentSourceMap.get(fieldName);
assertEquals(expected, ingestOutputs);
}
}