-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Test: bwc test for text chunking processor (#661)
* bwc test for text chunking processor Signed-off-by: yuye-aws <[email protected]> * spotless apply Signed-off-by: yuye-aws <[email protected]> * update changelog Signed-off-by: yuye-aws <[email protected]> * spotless apply Signed-off-by: yuye-aws <[email protected]> * add test document for restart upgrade Signed-off-by: yuye-aws <[email protected]> * rename pipeline configuration file Signed-off-by: yuye-aws <[email protected]> * fix pipeline create bug Signed-off-by: yuye-aws <[email protected]> * fix pipeline create bug Signed-off-by: yuye-aws <[email protected]> * filter tests for lower versions Signed-off-by: yuye-aws <[email protected]> * index create in chunking bwc test Signed-off-by: yuye-aws <[email protected]> * index create in chunking bwc test Signed-off-by: yuye-aws <[email protected]> * index create in chunking bwc test Signed-off-by: yuye-aws <[email protected]> * index validate in chunking bwc test Signed-off-by: yuye-aws <[email protected]> * filter bwc test for lower version Signed-off-by: yuye-aws <[email protected]> * bug fix in document ingestion in text chunking test Signed-off-by: yuye-aws <[email protected]> * ensure index creation in text chunking bwc test Signed-off-by: yuye-aws <[email protected]> * add comment Signed-off-by: yuye-aws <[email protected]> * update index setting Signed-off-by: yuye-aws <[email protected]> * update change log Signed-off-by: yuye-aws <[email protected]> * update gradle comment format Signed-off-by: yuye-aws <[email protected]> * update gradle file format Signed-off-by: yuye-aws <[email protected]> * rename bwc test filename Signed-off-by: yuye-aws <[email protected]> * update gradle file format Signed-off-by: yuye-aws <[email protected]> * update gradle file to filter tests Signed-off-by: yuye-aws <[email protected]> * merge method createPipelineProcessorWithoutModelId Signed-off-by: yuye-aws <[email protected]> * text chunking processor it: create pipeline method rename Signed-off-by: yuye-aws <[email protected]> * fix it failure Signed-off-by: yuye-aws <[email protected]> * include index mapping for text chunking index setting Signed-off-by: yuye-aws <[email protected]> * update nitpicking Signed-off-by: yuye-aws <[email protected]> --------- Signed-off-by: yuye-aws <[email protected]> (cherry picked from commit e69752c)
- Loading branch information
Showing
15 changed files
with
317 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
75 changes: 75 additions & 0 deletions
75
...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
package org.opensearch.neuralsearch.bwc; | ||
|
||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
import org.opensearch.index.query.MatchAllQueryBuilder; | ||
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER; | ||
|
||
public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase { | ||
|
||
private static final String PIPELINE_NAME = "pipeline-text-chunking"; | ||
private static final String INPUT_FIELD = "body"; | ||
private static final String OUTPUT_FIELD = "body_chunk"; | ||
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json"; | ||
private static final String TEST_INGEST_TEXT = | ||
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; | ||
List<String> expectedPassages = List.of( | ||
"This is an example document to be chunked. The document ", | ||
"contains a single paragraph, two sentences and 24 tokens by ", | ||
"standard tokenizer in OpenSearch." | ||
); | ||
|
||
// Test rolling-upgrade text chunking processor | ||
// Create Text Chunking Processor, Ingestion Pipeline and add document | ||
// Validate process, pipeline and document count in restart-upgrade scenario | ||
public void testTextChunkingProcessor_E2EFlow() throws Exception { | ||
waitForClusterHealthGreen(NODES_BWC_CLUSTER); | ||
String indexName = getIndexNameForTest(); | ||
if (isRunningAgainstOldCluster()) { | ||
createPipelineForTextChunkingProcessor(PIPELINE_NAME); | ||
createChunkingIndex(indexName); | ||
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages); | ||
} else { | ||
try { | ||
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages); | ||
} finally { | ||
wipeOfTestResources(indexName, PIPELINE_NAME, null, null); | ||
} | ||
} | ||
} | ||
|
||
private void createChunkingIndex(String indexName) throws Exception { | ||
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH); | ||
Objects.requireNonNull(documentURLPath); | ||
String indexSetting = Files.readString(Path.of(documentURLPath.toURI())); | ||
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME); | ||
} | ||
|
||
private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) { | ||
int docCount = getDocCount(indexName); | ||
assertEquals(documentCount, docCount); | ||
MatchAllQueryBuilder query = new MatchAllQueryBuilder(); | ||
Map<String, Object> searchResults = search(indexName, query, 10); | ||
assertNotNull(searchResults); | ||
Map<String, Object> document = getFirstInnerHit(searchResults); | ||
assertNotNull(document); | ||
Object documentSource = document.get("_source"); | ||
assert (documentSource instanceof Map); | ||
@SuppressWarnings("unchecked") | ||
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource; | ||
assert (documentSourceMap).containsKey(fieldName); | ||
Object ingestOutputs = documentSourceMap.get(fieldName); | ||
assertEquals(expected, ingestOutputs); | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"settings":{ | ||
"default_pipeline": "%s", | ||
"number_of_shards": 3, | ||
"number_of_replicas": 1 | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"body": { | ||
"type": "text" | ||
}, | ||
"body_chunk": { | ||
"type": "text" | ||
} | ||
} | ||
} | ||
} |
18 changes: 18 additions & 0 deletions
18
...t-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"description": "An example fixed token length chunker pipeline with standard tokenizer", | ||
"processors" : [ | ||
{ | ||
"text_chunking": { | ||
"field_map": { | ||
"body": "body_chunk" | ||
}, | ||
"algorithm": { | ||
"fixed_token_length": { | ||
"token_limit": 10, | ||
"tokenizer": "standard" | ||
} | ||
} | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
...olling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
package org.opensearch.neuralsearch.bwc; | ||
|
||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
import org.opensearch.index.query.MatchAllQueryBuilder; | ||
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER; | ||
|
||
public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase { | ||
|
||
private static final String PIPELINE_NAME = "pipeline-text-chunking"; | ||
private static final String INPUT_FIELD = "body"; | ||
private static final String OUTPUT_FIELD = "body_chunk"; | ||
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json"; | ||
private static final int NUM_DOCS_PER_ROUND = 1; | ||
private static final String TEST_INGEST_TEXT = | ||
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; | ||
|
||
List<String> expectedPassages = List.of( | ||
"This is an example document to be chunked. The document ", | ||
"contains a single paragraph, two sentences and 24 tokens by ", | ||
"standard tokenizer in OpenSearch." | ||
); | ||
|
||
// Test rolling-upgrade text chunking processor | ||
// Create Text Chunking Processor, Ingestion Pipeline and add document | ||
// Validate process, pipeline and document count in rolling-upgrade scenario | ||
public void testTextChunkingProcessor_E2EFlow() throws Exception { | ||
waitForClusterHealthGreen(NODES_BWC_CLUSTER); | ||
String indexName = getIndexNameForTest(); | ||
switch (getClusterType()) { | ||
case OLD: | ||
createPipelineForTextChunkingProcessor(PIPELINE_NAME); | ||
createChunkingIndex(indexName); | ||
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
break; | ||
case MIXED: | ||
int totalDocsCountMixed; | ||
if (isFirstMixedRound()) { | ||
totalDocsCountMixed = NUM_DOCS_PER_ROUND; | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages); | ||
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
} else { | ||
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND; | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages); | ||
} | ||
break; | ||
case UPGRADED: | ||
try { | ||
int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND; | ||
addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages); | ||
} finally { | ||
wipeOfTestResources(indexName, PIPELINE_NAME, null, null); | ||
} | ||
break; | ||
default: | ||
throw new IllegalStateException("Unexpected value: " + getClusterType()); | ||
} | ||
} | ||
|
||
private void createChunkingIndex(String indexName) throws Exception { | ||
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH); | ||
Objects.requireNonNull(documentURLPath); | ||
String indexSetting = Files.readString(Path.of(documentURLPath.toURI())); | ||
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME); | ||
} | ||
|
||
private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) { | ||
int docCount = getDocCount(indexName); | ||
assertEquals(documentCount, docCount); | ||
MatchAllQueryBuilder query = new MatchAllQueryBuilder(); | ||
Map<String, Object> searchResults = search(indexName, query, 10); | ||
assertNotNull(searchResults); | ||
Map<String, Object> document = getFirstInnerHit(searchResults); | ||
assertNotNull(document); | ||
Object documentSource = document.get("_source"); | ||
assert (documentSource instanceof Map); | ||
@SuppressWarnings("unchecked") | ||
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource; | ||
assert (documentSourceMap).containsKey(fieldName); | ||
Object ingestOutputs = documentSourceMap.get(fieldName); | ||
assertEquals(expected, ingestOutputs); | ||
} | ||
} |
Oops, something went wrong.