diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 52eddfc45..c1fd7b5ad 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -54,6 +54,7 @@ jobs: - onng_ngt - opensearchknn - panng_ngt + - parlayann - pg_embedding - pgvector - pgvecto_rs diff --git a/ann_benchmarks/algorithms/parlayann/Dockerfile b/ann_benchmarks/algorithms/parlayann/Dockerfile new file mode 100644 index 000000000..97afa11bd --- /dev/null +++ b/ann_benchmarks/algorithms/parlayann/Dockerfile @@ -0,0 +1,20 @@ +FROM ann-benchmarks + +RUN apt update +RUN apt install -y software-properties-common +RUN add-apt-repository -y ppa:git-core/ppa +RUN apt update +RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 + +#RUN apt-get update +#RUN apt-get install -y g++ software-properties-common +RUN pip3 install pybind11 numpy + +ARG CACHEBUST=1 +RUN git clone -b annbench https://github.com/cmuparlay/ParlayANN.git +RUN cd ParlayANN && git submodule update --init --recursive +RUN cd ParlayANN/python && bash compile.sh +#RUN cd ParlayANN/python && pip install -e . 
+#RUN python3 -c 'import parlaypy' +ENV PYTHONPATH=$PYTHONPATH:/home/app/ParlayANN/python +WORKDIR /home/app diff --git a/ann_benchmarks/algorithms/parlayann/config.yml b/ann_benchmarks/algorithms/parlayann/config.yml new file mode 100644 index 000000000..fa6ad8508 --- /dev/null +++ b/ann_benchmarks/algorithms/parlayann/config.yml @@ -0,0 +1,43 @@ +float: + euclidean: + - base_args: ['@metric'] + constructor: ParlayANN + disabled: false + docker_tag: ann-benchmarks-parlayann + module: ann_benchmarks.algorithms.parlayann + name: parlayann + run_groups: + parlay_80: + args: [{alpha: 1.15, R: 80, L: 160, two_pass: True}] + query_args: [[{Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 300}, {Q: 400}, {Q: 600}, {Q: 800}]] + parlay_64: + args: [{alpha: 1.1, R: 64, L: 128, two_pass: True}] + query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}, {Q: 22}, {Q: 25}, {Q: 30}, {Q: 40}, {Q: 50}, {Q: 60}, {Q: 80}, {Q: 100}, {Q: 125}, {Q: 150}, {Q: 200}, {Q: 300}, {Q: 400}, {Q: 600}, {Q: 800}]] + parlay_40: + args: [{alpha: 1.08, R: 40, L: 80, two_pass: True}] + query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}]] + parlay_32_05: + args: [{alpha: 1.05, R: 32, L: 64, two_pass: True}] + query_args: [[{limit: 10}, {limit: 11}, {limit: 12}, {limit: 13}, {limit: 14}, {limit: 15}, {limit: 16}, {limit: 18}, {limit: 20}, {limit: 22}, {Q: 10}, {Q: 11}, {Q: 12}, {Q: 14}, {Q: 16}, {Q: 18}, {Q: 20}]] + angular: + - base_args: ['@metric'] + constructor: ParlayANN + disabled: false + docker_tag: ann-benchmarks-parlayann + module: ann_benchmarks.algorithms.parlayann + name: parlayann + run_groups: + parlay_130: + args: 
class ParlayANN(BaseANN):
    """ann-benchmarks adapter around the ParlayANN Python wrapper (``pann``).

    The adapter dumps the training vectors to ``indices/base.bin``, asks the
    wrapper to build a Vamana index next to it (unless a file with the same
    name already exists) and then serves queries through the loaded index.
    """

    # Search settings in effect until set_query_arguments() overrides them.
    _DEFAULT_Q = 10
    _DEFAULT_LIMIT = 1000

    def __init__(self, metric, index_params):
        """Store build parameters.

        Args:
            metric: benchmark metric name; one of 'euclidean', 'ip', 'angular'.
            index_params: dict of build options.  Recognised keys are ``R``,
                ``L``, ``alpha`` and ``two_pass``; all are forwarded to
                ``pann.build_vamana_index`` by :meth:`fit`.

        Raises:
            Exception: if ``metric`` is not one of the supported names.
        """
        self.name = "parlayann_(" + str(index_params) + ")"
        self._index_params = index_params
        self._metric = self.translate_dist_fn(metric)

        self.R = int(index_params.get("R", 50))
        self.L = int(index_params.get("L", 100))
        self.alpha = float(index_params.get("alpha", 1.15))
        self.two_pass = bool(index_params.get("two_pass", False))

        # Give the search settings usable values up front so query() and
        # batch_query() do not fail with AttributeError when they are called
        # before set_query_arguments().
        self.Q = self._DEFAULT_Q
        self.limit = self._DEFAULT_LIMIT

    def translate_dist_fn(self, metric):
        """Map a benchmark metric name to the string handed to the wrapper.

        Raises:
            Exception: for any metric other than 'euclidean', 'ip', 'angular'.
        """
        if metric == 'euclidean':
            # NOTE(review): this exact spelling is the literal passed to the
            # wrapper; presumably it is what the wrapper expects -- confirm
            # before "correcting" it.
            return 'Euclidian'
        if metric in ('ip', 'angular'):
            # NOTE(review): 'angular' is served with the 'mips' metric and the
            # vectors are not normalised anywhere in this class; that matches
            # cosine ranking only for unit-length inputs -- TODO confirm.
            return 'mips'
        raise Exception('Invalid metric')

    def translate_dtype(self, dtype: str):
        """Return 'float' for 'float32'; any other dtype name is passed through."""
        return 'float' if dtype == 'float32' else dtype

    def fit(self, X):
        """Write ``X`` to disk, build the index if needed, and load it.

        Args:
            X: 2-D array-like of training vectors, one vector per row.  It is
                converted to float32 before being written, because the index
                is always built and loaded with the "float" element type.
        """
        print("Vamana: Starting Fit...")
        index_dir = "indices"
        os.makedirs(index_dir, exist_ok=True)

        data_path = os.path.join(index_dir, "base.bin")
        save_path = os.path.join(index_dir, self.name)
        print("parlayann: Index Stored At: " + save_path)

        vectors = np.ascontiguousarray(X, dtype=np.float32)
        nb, dims = vectors.shape

        # File layout: two 32-bit integers (row count, dimension) followed by
        # the row-major float32 data.  The header is written as real uint32
        # values rather than as integer bit patterns stored in float32 slots:
        # the bytes are the same, but no subnormal floats are involved and the
        # dataset is not copied just to prepend two numbers.
        with open(data_path, "wb") as out:
            np.array([nb, dims], dtype=np.uint32).tofile(out)
            vectors.tofile(out)

        # NOTE(review): an existing file at save_path is reused as-is, and the
        # name only encodes the build parameters -- verify that a stale index
        # from a different dataset cannot be picked up here.
        if not os.path.exists(save_path):
            print("parlayann: Creating Index")
            start = time.time()
            self.params = pann.build_vamana_index(
                self._metric, "float", data_path, save_path,
                self.R, self.L, self.alpha, self.two_pass,
            )
            end = time.time()
            print("Indexing time: ", end - start)
            print(f"Wrote index to {save_path}")

        self.index = pann.load_index(self._metric, "float", data_path, save_path)
        print("Index loaded")

    def query(self, X, k):
        """Return the wrapper's ``single_search`` result for query ``X``.

        ``k``, the current ``Q`` and ``limit`` settings are forwarded.
        """
        # NOTE(review): the meaning of the positional ``True`` flag is defined
        # by the wrapper and is not visible here -- confirm against its docs.
        return self.index.single_search(X, k, self.Q, True, self.limit)

    def batch_query(self, X, k):
        """Run ``batch_search`` on all queries in ``X``.

        The ids are kept in ``self.res`` and the distances in
        ``self.distances``; the ids are also returned.
        """
        print("running batch")
        self.res, self.distances = self.index.batch_search(
            X, k, self.Q, True, self.limit
        )
        return self.res

    def set_query_arguments(self, query_args):
        """Apply per-run search settings.

        Args:
            query_args: dict that may contain ``Q`` and/or ``limit``.  A
                missing key or an explicit ``None`` selects the default
                (``Q`` = 10, ``limit`` = 1000).
        """
        self.name = "parlayann_(" + str(self._index_params) + "," + str(query_args) + ")"
        print(query_args)

        limit = query_args.get("limit")
        beam = query_args.get("Q")
        self.limit = self._DEFAULT_LIMIT if limit is None else limit
        self.Q = self._DEFAULT_Q if beam is None else beam