-
Notifications
You must be signed in to change notification settings - Fork 128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add DocValues Support for Lucene Byte Sized Vector #953
Changes from all commits
3390c98
70c2e03
45ce916
51699fe
1b77b14
0eb0adf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,17 +45,20 @@ | |
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.function.Supplier; | ||
|
||
import static org.opensearch.knn.common.KNNConstants.DEFAULT_VECTOR_DATA_TYPE_FIELD; | ||
import static org.opensearch.knn.common.KNNConstants.KNN_METHOD; | ||
import static org.opensearch.knn.common.KNNConstants.VECTOR_DATA_TYPE_FIELD; | ||
import static org.opensearch.knn.index.KNNSettings.KNN_INDEX; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are the changes in this file related to adding doc values support for Lucene byte-sized vectors? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, here we are adding some extra validation checks with respect to the knnIndex setting and the k-NN engine. The changes also ingest doc values for byte vectors for use with script scoring. |
||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateVectorDataTypeWithEngine; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateVectorDataTypeWithKnnIndexSetting; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.addStoredFieldForVectorField; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateByteVectorValue; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateFloatVectorValue; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateVectorDataTypeWithEngine; | ||
import static org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil.validateVectorDimension; | ||
|
||
/** | ||
|
@@ -241,6 +244,12 @@ public KNNVectorFieldMapper build(BuilderContext context) { | |
.build(); | ||
return new LuceneFieldMapper(createLuceneFieldMapperInput); | ||
} | ||
|
||
// Validates and throws exception if data_type field is set in the index mapping | ||
// using any VectorDataType (other than float, which is default) because other | ||
// VectorDataTypes are only supported for lucene engine. | ||
validateVectorDataTypeWithEngine(vectorDataType); | ||
|
||
return new MethodFieldMapper( | ||
name, | ||
mappedFieldType, | ||
|
@@ -286,9 +295,14 @@ public KNNVectorFieldMapper build(BuilderContext context) { | |
this.efConstruction = LegacyFieldMapper.getEfConstruction(context.indexSettings()); | ||
} | ||
|
||
// Validates and throws exception if index.knn is set to true in the index settings | ||
// using any VectorDataType (other than float, which is default) because we are using NMSLIB engine for LegacyFieldMapper | ||
// and it only supports float VectorDataType | ||
validateVectorDataTypeWithKnnIndexSetting(context.indexSettings().getAsBoolean(KNN_INDEX, false), vectorDataType); | ||
|
||
return new LegacyFieldMapper( | ||
name, | ||
new KNNVectorFieldType(buildFullName(context), metaValue, dimension.getValue()), | ||
new KNNVectorFieldType(buildFullName(context), metaValue, dimension.getValue(), vectorDataType.getValue()), | ||
multiFieldsBuilder, | ||
copyToBuilder, | ||
ignoreMalformed, | ||
|
@@ -348,10 +362,6 @@ public Mapper.Builder<?> parse(String name, Map<String, Object> node, ParserCont | |
throw new IllegalArgumentException(String.format("Dimension value missing for vector: %s", name)); | ||
} | ||
|
||
// Validates and throws exception if data_type field is set in the index mapping | ||
// using any VectorDataType (other than float, which is default) with any engine (except lucene). | ||
validateVectorDataTypeWithEngine(builder.knnMethodContext, builder.vectorDataType); | ||
|
||
return builder; | ||
} | ||
} | ||
|
@@ -363,8 +373,8 @@ public static class KNNVectorFieldType extends MappedFieldType { | |
KNNMethodContext knnMethodContext; | ||
VectorDataType vectorDataType; | ||
|
||
public KNNVectorFieldType(String name, Map<String, String> meta, int dimension) { | ||
this(name, meta, dimension, null, null, DEFAULT_VECTOR_DATA_TYPE_FIELD); | ||
public KNNVectorFieldType(String name, Map<String, String> meta, int dimension, VectorDataType vectorDataType) { | ||
this(name, meta, dimension, null, null, vectorDataType); | ||
} | ||
|
||
public KNNVectorFieldType(String name, Map<String, String> meta, int dimension, KNNMethodContext knnMethodContext) { | ||
|
@@ -426,7 +436,7 @@ public Query termQuery(Object value, QueryShardContext context) { | |
@Override | ||
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, Supplier<SearchLookup> searchLookup) { | ||
failIfNoDocValues(); | ||
return new KNNVectorIndexFieldData.Builder(name(), CoreValuesSourceType.BYTES); | ||
return new KNNVectorIndexFieldData.Builder(name(), CoreValuesSourceType.BYTES, this.vectorDataType); | ||
} | ||
} | ||
|
||
|
@@ -480,16 +490,34 @@ protected void parseCreateField(ParseContext context, int dimension) throws IOEx | |
validateIfKNNPluginEnabled(); | ||
validateIfCircuitBreakerIsNotTriggered(); | ||
|
||
Optional<float[]> arrayOptional = getFloatsFromContext(context, dimension); | ||
if (VectorDataType.BYTE == vectorDataType) { | ||
Optional<byte[]> bytesArrayOptional = getBytesFromContext(context, dimension); | ||
|
||
if (!arrayOptional.isPresent()) { | ||
return; | ||
if (!bytesArrayOptional.isPresent()) { | ||
return; | ||
} | ||
final byte[] array = bytesArrayOptional.get(); | ||
VectorField point = new VectorField(name(), array, fieldType); | ||
|
||
context.doc().add(point); | ||
addStoredFieldForVectorField(context, fieldType, name(), point.toString()); | ||
} else if (VectorDataType.FLOAT == vectorDataType) { | ||
Optional<float[]> floatsArrayOptional = getFloatsFromContext(context, dimension); | ||
|
||
if (!floatsArrayOptional.isPresent()) { | ||
return; | ||
} | ||
final float[] array = floatsArrayOptional.get(); | ||
VectorField point = new VectorField(name(), array, fieldType); | ||
|
||
context.doc().add(point); | ||
addStoredFieldForVectorField(context, fieldType, name(), point.toString()); | ||
} else { | ||
throw new IllegalArgumentException( | ||
String.format(Locale.ROOT, "Cannot parse context for unsupported values provided for field [%s]", VECTOR_DATA_TYPE_FIELD) | ||
); | ||
} | ||
final float[] array = arrayOptional.get(); | ||
VectorField point = new VectorField(name(), array, fieldType); | ||
|
||
context.doc().add(point); | ||
addStoredFieldForVectorField(context, fieldType, name(), point.toString()); | ||
context.path().remove(); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why use a float[] for the byte type? Shouldn't it be an int[]?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are using this method in the scripting functions to retrieve the vector from doc values in order to calculate the score. So it doesn't make any difference whether we return it as int[] or float[]. If we returned it as int[], we would again need to do some method overloading and add methods for the space-type functions in ScoringUtils that accept int[].