Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

Support k-NN similarity functions in painless scripting #281

Merged
Merged
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ esplugin {
// zip file name and plugin name in ${elasticsearch.plugin.name} read by ES when plugin loading
description 'Open Distro for Elasticsearch KNN'
classname 'com.amazon.opendistroforelasticsearch.knn.plugin.KNNPlugin'
extendedPlugins = ['lang-painless']
}

tasks.named("integTest").configure {
Expand All @@ -107,6 +108,7 @@ task release(type: Copy, group: 'build') {
//****************************************************************************/
dependencies {
compileOnly "org.elasticsearch:elasticsearch:${es_version}"
compileOnly "org.elasticsearch.plugin:elasticsearch-scripting-painless-spi:${es_version}"
compile group: 'com.google.guava', name: 'failureaccess', version:'1.0.1'
compile group: 'com.google.guava', name: 'guava', version:'29.0-jre'
testImplementation "org.elasticsearch.test:framework:${es_version}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.amazon.opendistroforelasticsearch.knn.index;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.LeafReader;
import org.elasticsearch.index.fielddata.LeafFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;

import java.io.IOException;

public class KNNVectorDVLeafFieldData implements LeafFieldData {
jmazanec15 marked this conversation as resolved.
Show resolved Hide resolved

private final LeafReader reader;
private final String fieldName;

public KNNVectorDVLeafFieldData(LeafReader reader, String fieldName) {
this.reader = reader;
this.fieldName = fieldName;
}

@Override
public void close() {
// no-op
}

@Override
public long ramBytesUsed() {
return 0; // unknown
}

@Override
public ScriptDocValues<float[]> getScriptValues() {
try {
final BinaryDocValues values = reader.getBinaryDocValues(fieldName);
if (values == null) {
throw new IllegalStateException("Binary Doc values not enabled for the field " + fieldName
+ " Please ensure the field type is knn_vector in mappings for this field");
}
return new KNNVectorScriptDocValues(values);
} catch (IOException e) {
throw new IllegalStateException("Cannot load doc values for vector field!", e);
}
}

@Override
public SortedBinaryDocValues getBytesValues() {
throw new UnsupportedOperationException("knn vector field doesn't support sorting");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
Expand All @@ -41,13 +42,15 @@
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.QueryShardException;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.lookup.SearchLookup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;

import static com.amazon.opendistroforelasticsearch.knn.index.KNNSettings.INDEX_KNN_ALGO_PARAM_EF_CONSTRUCTION_SETTING;
import static com.amazon.opendistroforelasticsearch.knn.index.KNNSettings.INDEX_KNN_ALGO_PARAM_M_SETTING;
Expand Down Expand Up @@ -207,6 +210,12 @@ public Query termQuery(Object value, QueryShardContext context) {
public int getDimension() {
return dimension;
}

@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, Supplier<SearchLookup> searchLookup) {
jmazanec15 marked this conversation as resolved.
Show resolved Hide resolved
failIfNoDocValues();
return new KNNVectorIndexFieldData.Builder(name(), CoreValuesSourceType.BYTES);
}
}

protected Explicit<Boolean> ignoreMalformed;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package com.amazon.opendistroforelasticsearch.knn.index;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.SortField;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.MultiValueMode;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import org.elasticsearch.search.sort.BucketedSort;
import org.elasticsearch.search.sort.SortOrder;

public class KNNVectorIndexFieldData implements IndexFieldData<KNNVectorDVLeafFieldData> {

private final String fieldName;
private final ValuesSourceType valuesSourceType;

public KNNVectorIndexFieldData(String fieldName, ValuesSourceType valuesSourceType) {
this.fieldName = fieldName;
this.valuesSourceType = valuesSourceType;
}

@Override
public String getFieldName() {
return fieldName;
}

@Override
public ValuesSourceType getValuesSourceType() {
return valuesSourceType;
}

@Override
public KNNVectorDVLeafFieldData load(LeafReaderContext context) {
return new KNNVectorDVLeafFieldData(context.reader(), fieldName);
}

@Override
public KNNVectorDVLeafFieldData loadDirect(LeafReaderContext context) {
return load(context);
}

@Override
public SortField sortField(Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested, boolean reverse) {
throw new UnsupportedOperationException("knn vector field doesn't support this operation");
}

@Override
public BucketedSort newBucketedSort(
BigArrays bigArrays, Object missingValue,
MultiValueMode sortMode, XFieldComparatorSource.Nested nested,
SortOrder sortOrder, DocValueFormat format, int bucketSize, BucketedSort.ExtraData extra) {
throw new UnsupportedOperationException("knn vector field doesn't support this operation");
}


public static class Builder implements IndexFieldData.Builder {

private final String name;
private final ValuesSourceType valuesSourceType;


public Builder(String name, ValuesSourceType valuesSourceType) {
this.name = name;
this.valuesSourceType = valuesSourceType;
}

@Override
public IndexFieldData<?> build(IndexFieldDataCache cache, CircuitBreakerService breakerService) {
return new KNNVectorIndexFieldData(name, valuesSourceType);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistroforelasticsearch.knn.index;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.fielddata.ScriptDocValues;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

// This class is thread safe, since docExists is synchronized at an instance level
public final class KNNVectorScriptDocValues extends ScriptDocValues<float[]> {
jmazanec15 marked this conversation as resolved.
Show resolved Hide resolved

private final BinaryDocValues binaryDocValues;
private boolean docExists;

public KNNVectorScriptDocValues(BinaryDocValues binaryDocValues) {
this.binaryDocValues = binaryDocValues;
}

@Override
public void setNextDocId(int docId) throws IOException {
synchronized (this) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need synchronized here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added thread safe to prevent any race condition based on feedback, since docExists setting is happening in one place and reading is happening on another place. If there is not threat with race condition then definitely we can remove this. At the end, it comes to do we need to make this instance thread safe or not? What do you think?

Copy link
Member

@vamshin vamshin Dec 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Do you have pointers to any other base classes extending ScriptDocValues<> doing synchronization? My understanding is each search request should have its own instance and documents are iterated sequentially so synchronization should not be required. If you see any base class doing this we can definitely consider.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed synchronization.

if (binaryDocValues.advanceExact(docId)) {
docExists = true;
return;
}
docExists = false;
}
}

public synchronized float[] getValue() throws IOException {
if (!docExists) {
throw new IllegalArgumentException("no value found for the corresponding doc ID");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why throw IllegalArgumentException? It seems like it maybe should be an IllegalStateException.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack

}
BytesRef value = binaryDocValues.binaryValue();
ByteArrayInputStream byteStream = new ByteArrayInputStream(value.bytes, value.offset, value.length);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
try {
return (float[]) objectStream.readObject();
} catch (ClassNotFoundException e) {
throw new RuntimeException((e));
}
}

@Override
public int size() {
synchronized (this) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need synchronized here? Avoid synchronization in the places not needed as it could hamper performance.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added so that docExists is synchronized. Since it is just one instruction, i think i can remove this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes please. I have not seen other places doing this. We can remove this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed

return docExists ? 1 : 0;
}
}

@Override
public float[] get(int i) {
throw new UnsupportedOperationException("knn vector does not support this operation");
}
}
Loading