-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
16 changed files
with
322 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
75 changes: 75 additions & 0 deletions
75
...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
package org.opensearch.neuralsearch.bwc; | ||
|
||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
import org.opensearch.index.query.MatchAllQueryBuilder; | ||
import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER; | ||
|
||
public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase { | ||
|
||
private static final String PIPELINE_NAME = "pipeline-text-chunking"; | ||
private static final String INPUT_FIELD = "body"; | ||
private static final String OUTPUT_FIELD = "body_chunk"; | ||
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json"; | ||
private static final String TEST_INGEST_TEXT = | ||
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; | ||
List<String> expectedPassages = List.of( | ||
"This is an example document to be chunked. The document ", | ||
"contains a single paragraph, two sentences and 24 tokens by ", | ||
"standard tokenizer in OpenSearch." | ||
); | ||
|
||
// Test rolling-upgrade text chunking processor | ||
// Create Text Chunking Processor, Ingestion Pipeline and add document | ||
// Validate process, pipeline and document count in restart-upgrade scenario | ||
public void testTextChunkingProcessor_E2EFlow() throws Exception { | ||
waitForClusterHealthGreen(NODES_BWC_CLUSTER); | ||
String indexName = getIndexNameForTest(); | ||
if (isRunningAgainstOldCluster()) { | ||
createPipelineForTextChunkingProcessor(PIPELINE_NAME); | ||
createChunkingIndex(indexName); | ||
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages); | ||
} else { | ||
try { | ||
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages); | ||
} finally { | ||
wipeOfTestResources(indexName, PIPELINE_NAME, null, null); | ||
} | ||
} | ||
} | ||
|
||
private void createChunkingIndex(String indexName) throws Exception { | ||
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH); | ||
Objects.requireNonNull(documentURLPath); | ||
String indexSetting = Files.readString(Path.of(documentURLPath.toURI())); | ||
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME); | ||
} | ||
|
||
private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) { | ||
int docCount = getDocCount(indexName); | ||
assertEquals(documentCount, docCount); | ||
MatchAllQueryBuilder query = new MatchAllQueryBuilder(); | ||
Map<String, Object> searchResults = search(indexName, query, 10); | ||
assertNotNull(searchResults); | ||
Map<String, Object> document = getFirstInnerHit(searchResults); | ||
assertNotNull(document); | ||
Object documentSource = document.get("_source"); | ||
assert (documentSource instanceof Map); | ||
@SuppressWarnings("unchecked") | ||
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource; | ||
assert (documentSourceMap).containsKey(fieldName); | ||
Object ingestOutputs = documentSourceMap.get(fieldName); | ||
assertEquals(expected, ingestOutputs); | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"settings":{ | ||
"default_pipeline": "%s", | ||
"number_of_shards": 3, | ||
"number_of_replicas": 1 | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"body": { | ||
"type": "text" | ||
}, | ||
"body_chunk": { | ||
"type": "text" | ||
} | ||
} | ||
} | ||
} |
18 changes: 18 additions & 0 deletions
18
...t-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"description": "An example fixed token length chunker pipeline with standard tokenizer", | ||
"processors" : [ | ||
{ | ||
"text_chunking": { | ||
"field_map": { | ||
"body": "body_chunk" | ||
}, | ||
"algorithm": { | ||
"fixed_token_length": { | ||
"token_limit": 10, | ||
"tokenizer": "standard" | ||
} | ||
} | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
...olling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
package org.opensearch.neuralsearch.bwc; | ||
|
||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
import org.opensearch.index.query.MatchAllQueryBuilder; | ||
import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER; | ||
|
||
public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase { | ||
|
||
private static final String PIPELINE_NAME = "pipeline-text-chunking"; | ||
private static final String INPUT_FIELD = "body"; | ||
private static final String OUTPUT_FIELD = "body_chunk"; | ||
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json"; | ||
private static final int NUM_DOCS_PER_ROUND = 1; | ||
private static final String TEST_INGEST_TEXT = | ||
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; | ||
|
||
List<String> expectedPassages = List.of( | ||
"This is an example document to be chunked. The document ", | ||
"contains a single paragraph, two sentences and 24 tokens by ", | ||
"standard tokenizer in OpenSearch." | ||
); | ||
|
||
// Test rolling-upgrade text chunking processor | ||
// Create Text Chunking Processor, Ingestion Pipeline and add document | ||
// Validate process, pipeline and document count in rolling-upgrade scenario | ||
public void testTextChunkingProcessor_E2EFlow() throws Exception { | ||
waitForClusterHealthGreen(NODES_BWC_CLUSTER); | ||
String indexName = getIndexNameForTest(); | ||
switch (getClusterType()) { | ||
case OLD: | ||
createPipelineForTextChunkingProcessor(PIPELINE_NAME); | ||
createChunkingIndex(indexName); | ||
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
break; | ||
case MIXED: | ||
int totalDocsCountMixed; | ||
if (isFirstMixedRound()) { | ||
totalDocsCountMixed = NUM_DOCS_PER_ROUND; | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages); | ||
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
} else { | ||
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND; | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages); | ||
} | ||
break; | ||
case UPGRADED: | ||
try { | ||
int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND; | ||
addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null); | ||
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages); | ||
} finally { | ||
wipeOfTestResources(indexName, PIPELINE_NAME, null, null); | ||
} | ||
break; | ||
default: | ||
throw new IllegalStateException("Unexpected value: " + getClusterType()); | ||
} | ||
} | ||
|
||
private void createChunkingIndex(String indexName) throws Exception { | ||
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH); | ||
Objects.requireNonNull(documentURLPath); | ||
String indexSetting = Files.readString(Path.of(documentURLPath.toURI())); | ||
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME); | ||
} | ||
|
||
private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) { | ||
int docCount = getDocCount(indexName); | ||
assertEquals(documentCount, docCount); | ||
MatchAllQueryBuilder query = new MatchAllQueryBuilder(); | ||
Map<String, Object> searchResults = search(indexName, query, 10); | ||
assertNotNull(searchResults); | ||
Map<String, Object> document = getFirstInnerHit(searchResults); | ||
assertNotNull(document); | ||
Object documentSource = document.get("_source"); | ||
assert (documentSource instanceof Map); | ||
@SuppressWarnings("unchecked") | ||
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource; | ||
assert (documentSourceMap).containsKey(fieldName); | ||
Object ingestOutputs = documentSourceMap.get(fieldName); | ||
assertEquals(expected, ingestOutputs); | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
qa/rolling-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"settings":{ | ||
"default_pipeline": "%s", | ||
"number_of_shards": 3, | ||
"number_of_replicas": 1 | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"body": { | ||
"type": "text" | ||
}, | ||
"body_chunk": { | ||
"type": "text" | ||
} | ||
} | ||
} | ||
} |
Oops, something went wrong.