-
Notifications
You must be signed in to change notification settings - Fork 136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merge efficient filtering from feature branch #588
Changes from 6 commits
4c8cf93
47b9ad4
2e18ae8
9b32e17
44f10de
52e2b6b
a2b92b1
d015c35
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.knn.bwc; | ||
|
||
import org.opensearch.knn.TestUtils; | ||
import org.opensearch.knn.index.query.KNNQueryBuilder; | ||
import org.opensearch.index.query.QueryBuilders; | ||
import org.opensearch.index.query.TermQueryBuilder; | ||
|
||
import org.opensearch.client.Request; | ||
import org.opensearch.client.ResponseException; | ||
import org.opensearch.common.Strings; | ||
import org.opensearch.common.xcontent.ToXContent; | ||
import org.opensearch.common.xcontent.XContentBuilder; | ||
import org.opensearch.common.xcontent.XContentFactory; | ||
|
||
import java.io.IOException; | ||
|
||
import static org.opensearch.knn.TestUtils.NODES_BWC_CLUSTER; | ||
|
||
/** | ||
* Tests scenarios specific to filtering functionality in k-NN in case Lucene is set as an engine | ||
*/ | ||
public class LuceneFilteringIT extends AbstractRollingUpgradeTestCase { | ||
private static final String TEST_FIELD = "test-field"; | ||
private static final int DIMENSIONS = 50; | ||
private static final int K = 10; | ||
private static final int NUM_DOCS = 100; | ||
private static final TermQueryBuilder TERM_QUERY = QueryBuilders.termQuery("_id", "100"); | ||
|
||
public void testLuceneFiltering() throws Exception { | ||
waitForClusterHealthGreen(NODES_BWC_CLUSTER); | ||
float[] queryVector = TestUtils.getQueryVectors(1, DIMENSIONS, NUM_DOCS, true)[0]; | ||
switch (getClusterType()) { | ||
case OLD: | ||
createKnnIndex(testIndex, getKNNDefaultIndexSettings(), createKnnIndexMappingWithLuceneField(TEST_FIELD, DIMENSIONS)); | ||
bulkAddKnnDocs(testIndex, TEST_FIELD, TestUtils.getIndexVectors(NUM_DOCS, DIMENSIONS, true), NUM_DOCS); | ||
validateSearchKNNIndexFailed(testIndex, new KNNQueryBuilder(TEST_FIELD, queryVector, K, TERM_QUERY), K); | ||
break; | ||
case MIXED: | ||
validateSearchKNNIndexFailed(testIndex, new KNNQueryBuilder(TEST_FIELD, queryVector, K, TERM_QUERY), K); | ||
break; | ||
case UPGRADED: | ||
searchKNNIndex(testIndex, new KNNQueryBuilder(TEST_FIELD, queryVector, K, TERM_QUERY), K); | ||
deleteKNNIndex(testIndex); | ||
break; | ||
} | ||
} | ||
|
||
protected String createKnnIndexMappingWithLuceneField(final String fieldName, int dimension) throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch, let me switch to existing method |
||
return Strings.toString( | ||
XContentFactory.jsonBuilder() | ||
.startObject() | ||
.startObject("properties") | ||
.startObject(fieldName) | ||
.field("type", "knn_vector") | ||
.field("dimension", Integer.toString(dimension)) | ||
.startObject("method") | ||
.field("name", "hnsw") | ||
.field("engine", "lucene") | ||
.field("space_type", "l2") | ||
.endObject() | ||
.endObject() | ||
.endObject() | ||
.endObject() | ||
); | ||
} | ||
|
||
private void validateSearchKNNIndexFailed(String index, KNNQueryBuilder knnQueryBuilder, int resultSize) throws IOException { | ||
XContentBuilder builder = XContentFactory.jsonBuilder().startObject().startObject("query"); | ||
knnQueryBuilder.doXContent(builder, ToXContent.EMPTY_PARAMS); | ||
builder.endObject().endObject(); | ||
|
||
Request request = new Request("POST", "/" + index + "/_search"); | ||
|
||
request.addParameter("size", Integer.toString(resultSize)); | ||
request.addParameter("explain", Boolean.toString(true)); | ||
request.addParameter("search_type", "query_then_fetch"); | ||
request.setJsonEntity(Strings.toString(builder)); | ||
|
||
expectThrows(ResponseException.class, () -> client().performRequest(request)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if this fails with another unrelated exception? i.e. are we sure this will work in a valid case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's possible to get same exception type for a different cause. Let me add one more assert for the error message, I think that should be specific to our case |
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.knn.index; | ||
|
||
import lombok.AccessLevel; | ||
import lombok.NoArgsConstructor; | ||
import lombok.extern.log4j.Log4j2; | ||
import org.opensearch.Version; | ||
import org.opensearch.cluster.service.ClusterService; | ||
|
||
/** | ||
* Class abstracts information related to underlying OpenSearch cluster | ||
*/ | ||
@NoArgsConstructor(access = AccessLevel.PRIVATE) | ||
@Log4j2 | ||
public class KNNClusterContext { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems to me this is a utility or a helper. Context objects from what I have seen in OpenSearch usually have state associated with them. Is there a better name for this class? Maybe like KNNClusterUtility? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree, we can change/extend it in case we have more functionality or state in future. I like KNNClusterUtility, once small doubt is that in existing k-nn codebase we do have prefix as "Util", so I'll go with KNNClusterUtil |
||
|
||
private ClusterService clusterService; | ||
private static KNNClusterContext instance; | ||
|
||
/** | ||
* Return instance of the cluster context, must be initialized first for proper usage | ||
* @return instance of cluster context | ||
*/ | ||
public static synchronized KNNClusterContext instance() { | ||
if (instance == null) { | ||
instance = new KNNClusterContext(); | ||
} | ||
return instance; | ||
} | ||
|
||
/** | ||
* Initializes instance of cluster context by injecting dependencies | ||
* @param clusterService | ||
*/ | ||
public void initialize(final ClusterService clusterService) { | ||
this.clusterService = clusterService; | ||
} | ||
|
||
/** | ||
* Return minimal OpenSearch version based on all nodes currently discoverable in the cluster | ||
* @return minimal installed OpenSearch version, default to Version.CURRENT which is typically the latest version | ||
*/ | ||
public Version getClusterMinVersion() { | ||
try { | ||
return this.clusterService.state().getNodes().getMinNodeVersion(); | ||
} catch (Exception exception) { | ||
log.error( | ||
String.format("Failed to get cluster minimum node version, returning current node version %s instead.", Version.CURRENT), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What was the reasoning here to return Current as opposed to propagating the error up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking on keeping system running, the only reason I can imagine for this.clusterService.state().getNodes().getMinNodeVersion(); call to fail is related to network/transport issues that are stoping one or more nodes from submitting their version info to the cluster state. As we're not changing system state in knn query workflow and we're doing this check for all knn queries even without filter field, I think it's better to assume current version rather than failing. |
||
exception | ||
); | ||
return Version.CURRENT; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.extern.log4j.Log4j2; | ||
import org.apache.lucene.codecs.KnnVectorsFormat; | ||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; | ||
import org.opensearch.index.mapper.MapperService; | ||
import org.opensearch.knn.common.KNNConstants; | ||
import org.opensearch.knn.index.mapper.KNNVectorFieldMapper; | ||
|
||
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.function.BiFunction; | ||
import java.util.function.Supplier; | ||
|
||
/** | ||
* Base class for PerFieldKnnVectorsFormat, builds KnnVectorsFormat based on specific Lucene version | ||
*/ | ||
@AllArgsConstructor | ||
@Log4j2 | ||
public abstract class BasePerFieldKnnVectorsFormat extends PerFieldKnnVectorsFormat { | ||
|
||
private final Optional<MapperService> mapperService; | ||
private final int defaultMaxConnections; | ||
private final int defaultBeamWidth; | ||
private final Supplier<KnnVectorsFormat> defaultFormatSupplier; | ||
private final BiFunction<Integer, Integer, KnnVectorsFormat> formatSupplier; | ||
|
||
@Override | ||
public KnnVectorsFormat getKnnVectorsFormatForField(final String field) { | ||
if (isKnnVectorFieldType(field) == false) { | ||
log.debug( | ||
"Initialize KNN vector format for field [{}] with default params [max_connections] = \"{}\" and [beam_width] = \"{}\"", | ||
field, | ||
defaultMaxConnections, | ||
defaultBeamWidth | ||
); | ||
return defaultFormatSupplier.get(); | ||
} | ||
var type = (KNNVectorFieldMapper.KNNVectorFieldType) mapperService.orElseThrow( | ||
() -> new IllegalStateException( | ||
String.format("Cannot read field type for field [%s] because mapper service is not available", field) | ||
) | ||
).fieldType(field); | ||
var params = type.getKnnMethodContext().getMethodComponent().getParameters(); | ||
int maxConnections = getMaxConnections(params); | ||
int beamWidth = getBeamWidth(params); | ||
log.debug( | ||
"Initialize KNN vector format for field [{}] with params [max_connections] = \"{}\" and [beam_width] = \"{}\"", | ||
field, | ||
maxConnections, | ||
beamWidth | ||
); | ||
return formatSupplier.apply(maxConnections, beamWidth); | ||
} | ||
|
||
private boolean isKnnVectorFieldType(final String field) { | ||
return mapperService.isPresent() && mapperService.get().fieldType(field) instanceof KNNVectorFieldMapper.KNNVectorFieldType; | ||
} | ||
|
||
private int getMaxConnections(final Map<String, Object> params) { | ||
if (params != null && params.containsKey(KNNConstants.METHOD_PARAMETER_M)) { | ||
return (int) params.get(KNNConstants.METHOD_PARAMETER_M); | ||
} | ||
return defaultMaxConnections; | ||
} | ||
|
||
private int getBeamWidth(final Map<String, Object> params) { | ||
if (params != null && params.containsKey(KNNConstants.METHOD_PARAMETER_EF_CONSTRUCTION)) { | ||
return (int) params.get(KNNConstants.METHOD_PARAMETER_EF_CONSTRUCTION); | ||
} | ||
return defaultBeamWidth; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens when we start upgrading from 2.4 to 2.5 or 3.x?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we'll need to disable this test for higher versions similarly to what we're doing for some other IT, this will work for cases when previous version doesn't have filtering and next does have it