Skip to content

Commit

Permalink
Merge pull request #202 from stephenleo/master
Browse files Browse the repository at this point in the history
Adds support for the Open Distro for Elasticsearch k-NN plugin. Closes #174.
  • Loading branch information
erikbern authored Dec 15, 2020
2 parents 7aead5a + eb97504 commit f126b20
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 1 deletion.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ env:
- LIBRARY=scann DATASET=random-xs-20-angular
- LIBRARY=elasticsearch DATASET=random-xs-20-angular
- LIBRARY=elastiknn DATASET=random-xs-20-angular
- LIBRARY=opendistroknn DATASET=random-xs-20-angular

before_install:
- pip install -r requirements.txt
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Evaluated
* [N2](https://github.com/kakao/n2)
* [ScaNN](https://github.com/google-research/google-research/tree/master/scann)
* [Elastiknn](https://github.com/alexklibisz/elastiknn)
* [OpenDistro Elasticsearch KNN](https://github.com/opendistro-for-elasticsearch/k-NN)

Data sets
=========
Expand Down
69 changes: 68 additions & 1 deletion algos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ float:
- {"M": 96, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]


flann:
docker-tag: ann-benchmarks-flann
module: ann_benchmarks.algorithms.flann
Expand Down Expand Up @@ -474,6 +473,40 @@ float:
query-args:
- [1000, 10000]
- [0, 6]
# Open Distro for Elasticsearch k-NN plugin.
# Each run group below varies only "M"; "efConstruction" is fixed at 500.
# NOTE(review): the arg-groups keys match what OpenDistroKNN reads from its
# method_param dict ("M", "efConstruction"); the query-args values presumably
# become the `ef` passed to set_query_arguments -- confirm against the runner.
opendistroknn:
docker-tag: ann-benchmarks-opendistroknn
module: ann_benchmarks.algorithms.opendistroknn
constructor: OpenDistroKNN
# Placeholders for the first two constructor parameters (metric, dimension).
base-args: ["@metric", "@dimension"]
run-groups:
M-4:
arg-groups:
- {"M": 4, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-8:
arg-groups:
- {"M": 8, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-12:
arg-groups:
- {"M": 12, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-16:
arg-groups:
- {"M": 16, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-24:
arg-groups:
- {"M": 24, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-36:
arg-groups:
- {"M": 36, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-48:
arg-groups:
- {"M": 48, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]

angular:
puffinn:
Expand Down Expand Up @@ -613,6 +646,40 @@ float:
run-groups:
empty:
args: []
# Open Distro for Elasticsearch k-NN plugin.
# Each run group below varies only "M"; "efConstruction" is fixed at 500.
# NOTE(review): the arg-groups keys match what OpenDistroKNN reads from its
# method_param dict ("M", "efConstruction"); the query-args values presumably
# become the `ef` passed to set_query_arguments -- confirm against the runner.
opendistroknn:
docker-tag: ann-benchmarks-opendistroknn
module: ann_benchmarks.algorithms.opendistroknn
constructor: OpenDistroKNN
# Placeholders for the first two constructor parameters (metric, dimension).
base-args: ["@metric", "@dimension"]
run-groups:
M-4:
arg-groups:
- {"M": 4, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-8:
arg-groups:
- {"M": 8, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-12:
arg-groups:
- {"M": 12, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-16:
arg-groups:
- {"M": 16, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-24:
arg-groups:
- {"M": 24, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-36:
arg-groups:
- {"M": 36, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
M-48:
arg-groups:
- {"M": 48, "efConstruction": 500}
query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]

bit:
hamming:
Expand Down
98 changes: 98 additions & 0 deletions ann_benchmarks/algorithms/opendistroknn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import logging
from time import sleep
from urllib.error import URLError
from urllib.request import Request, urlopen

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

from ann_benchmarks.algorithms.base import BaseANN

from .elasticsearch import es_wait

# Raise the threshold of the "elasticsearch" logger so that only warnings
# and more severe records from it are emitted.
logging.getLogger("elasticsearch").setLevel(logging.WARNING)

class OpenDistroKNN(BaseANN):
    """Benchmark wrapper for the Open Distro for Elasticsearch k-NN plugin.

    Talks to an Elasticsearch node at ``http://localhost:9200``. Vectors are
    stored in a single index whose name is derived from the build parameters.
    """

    def __init__(self, metric, dimension, method_param):
        """Record the build parameters and connect to the local node.

        Args:
            metric: ``"angular"`` or ``"euclidean"``; mapped to the plugin's
                ``cosinesimil`` / ``l2`` space types. Any other value raises
                ``KeyError``.
            dimension: Dimensionality of the indexed vectors.
            method_param: Dict of index build parameters; ``fit`` reads the
                ``"M"`` and ``"efConstruction"`` keys.
        """
        self.metric = {"angular": "cosinesimil", "euclidean": "l2"}[metric]
        self.dimension = dimension
        self.method_param = method_param
        self.param_string = "-".join(
            k + "-" + str(v) for k, v in self.method_param.items()
        ).lower()
        # The index name encodes the build parameters, e.g. "od-m-4-efconstruction-500".
        self.name = f"od-{self.param_string}"
        self.es = Elasticsearch(["http://localhost:9200"])
        # NOTE(review): es_wait is defined in the sibling elasticsearch module;
        # presumably it blocks until the node is reachable -- confirm there.
        es_wait()

    def fit(self, X):
        """Create the k-NN index, bulk-upload ``X`` and prepare it for search.

        Args:
            X: Iterable of vectors; each element must provide ``tolist()``.
                Row ``i`` is stored with the document id ``str(i + 1)``.
        """
        body = {
            "settings": {
                "index": {
                    "knn": True,
                    "knn.space_type": self.metric,
                    "knn.algo_param.ef_construction": self.method_param["efConstruction"],
                    "knn.algo_param.m": self.method_param["M"]
                },
                "number_of_shards": 1,
                "number_of_replicas": 0,
                # Automatic refresh is disabled; the index is refreshed
                # explicitly once the upload and force-merge are done.
                "refresh_interval": -1
            }
        }

        mapping = {
            "properties": {
                "id": {"type": "keyword", "store": True},
                "vec": {"type": "knn_vector", "dimension": self.dimension}
            }
        }

        self.es.indices.create(self.name, body=body)
        self.es.indices.put_mapping(mapping, self.name)

        print("Uploading data to the Index:", self.name)

        def gen():
            for i, vec in enumerate(X):
                yield { "_op_type": "index", "_index": self.name, "vec": vec.tolist(), 'id': str(i + 1) }

        (_, errors) = bulk(self.es, gen(), chunk_size=500, max_retries=9, request_timeout=10)
        assert len(errors) == 0, errors

        print("Force Merge...")
        self.es.indices.forcemerge(self.name, max_num_segments=1, request_timeout=1000)

        print("Refreshing the Index...")
        self.es.indices.refresh(self.name, request_timeout=1000)

        print("Running Warmup API...")
        warmup_url = "http://localhost:9200/_opendistro/_knn/warmup/"+self.name+"?pretty"
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(Request(warmup_url)) as res:
            print(res.read().decode("utf-8"))

    def set_query_arguments(self, ef):
        """Set the ``knn.algo_param.ef_search`` setting on this index.

        Args:
            ef: Value written to ``knn.algo_param.ef_search``.
        """
        body = {
            "settings": {
                "index": {"knn.algo_param.ef_search": ef}
            }
        }
        # Scope the update to our own index; without ``index`` the request
        # targets every index on the node.
        self.es.indices.put_settings(body=body, index=self.name)

    def query(self, q, n):
        """Return the zero-based ids of the ``n`` hits for query vector ``q``.

        Args:
            q: Query vector providing ``tolist()``.
            n: Number of neighbours requested (used as both ``k`` and ``size``).

        Returns:
            List of ints: the stored ``id`` of each hit minus one, undoing the
            ``i + 1`` offset applied in ``fit``.
        """
        body = {
            "query": {
                "knn": {
                    "vec": {"vector": q.tolist(), "k": n}
                }
            }
        }

        # Only the "id" field of each hit is requested, via filter_path.
        res = self.es.search(index=self.name, body=body, size=n, _source=False, docvalue_fields=['id'],
                             stored_fields="_none_", filter_path=["hits.hits.fields.id"], request_timeout=10)

        return [int(h['fields']['id'][0]) - 1 for h in res['hits']['hits']]

    def batch_query(self, X, n):
        """Run ``query`` sequentially for every vector in ``X`` and keep the results."""
        self.batch_res = [self.query(q, n) for q in X]

    def get_batch_results(self):
        """Return the results stored by the most recent ``batch_query`` call."""
        return self.batch_res

    def freeIndex(self):
        """Delete this instance's index from the node."""
        self.es.indices.delete(index=self.name)
55 changes: 55 additions & 0 deletions install/Dockerfile.opendistroknn
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Warning! Do not use this config in production! This is only for testing and security has been turned off.

# Build on top of the shared ann-benchmarks base image.
FROM ann-benchmarks

WORKDIR /home/app

# Install Open Distro following instructions from https://opendistro.github.io/for-elasticsearch-docs/docs/install/deb/
# Step 1: OpenJDK 11 from the openjdk-r PPA (add-apt-repository comes from software-properties-common).
RUN apt-get install software-properties-common -y
RUN add-apt-repository ppa:openjdk-r/ppa \
&& apt update \
&& apt install openjdk-11-jdk -y
RUN apt install unzip -y \
&& apt-get install wget -y
# Step 2: register the Open Distro signing key and apt repository.
RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add -
RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list
# Step 3: install Elasticsearch OSS 7.9.1 from its .deb, then the Open Distro package from the repository added above.
RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.1-amd64.deb \
&& dpkg -i elasticsearch-oss-7.9.1-amd64.deb
RUN apt-get update \
&& apt install opendistroforelasticsearch -y

# Install python client.
# The client is pinned to 7.9.1, the same version as the Elasticsearch package installed above.
RUN python3 -m pip install --upgrade elasticsearch==7.9.1

# Configure elasticsearch and JVM for single-node, single-core.
# This overwrites elasticsearch.yml: the security plugin is disabled and the
# processor count and the write/search thread pools are each limited to 1.
RUN echo '\
opendistro_security.disabled: true\n\
discovery.type: single-node\n\
network.host: 0.0.0.0\n\
node.master: true\n\
node.data: true\n\
node.processors: 1\n\
thread_pool.write.size: 1\n\
thread_pool.search.size: 1\n\
thread_pool.search.queue_size: 1\n\
path.data: /var/lib/elasticsearch\n\
path.logs: /var/log/elasticsearch\n\
' > /etc/elasticsearch/elasticsearch.yml

# This overwrites jvm.options: 3G initial and maximum heap, G1 collector, and
# heap dumps / error files / GC logs written under the Elasticsearch data and log directories.
RUN echo '\
-Xms3G\n\
-Xmx3G\n\
-XX:+UseG1GC\n\
-XX:G1ReservePercent=25\n\
-XX:InitiatingHeapOccupancyPercent=30\n\
-XX:+HeapDumpOnOutOfMemoryError\n\
-XX:HeapDumpPath=/var/lib/elasticsearch\n\
-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log\n\
-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m' > /etc/elasticsearch/jvm.options

# Make sure you can start the service.
RUN service elasticsearch start && service elasticsearch stop

# Custom entrypoint that also starts the Elasticsearch server.
# The script forwards all container arguments ("$@") to run_algorithm.py.
RUN echo 'service elasticsearch start && python3 -u run_algorithm.py "$@"' > entrypoint.sh
ENTRYPOINT ["/bin/bash", "/home/app/entrypoint.sh"]

0 comments on commit f126b20

Please sign in to comment.