From f99a418c5544c2e08e41c7d72a2168f983de8b63 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 26 Jul 2023 09:12:04 -0400 Subject: [PATCH] Simplify `bench/ann` scripts to Python based module (#1642) Closes https://github.com/rapidsai/raft/issues/1633 Authors: - Divye Gala (https://github.com/divyegala) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/raft/pull/1642 --- .../bench_ann_cuda-118_arch-x86_64.yaml | 1 + cpp/bench/ann/conf/glove-100-inner.json | 6 +- cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 2 + .../ann/src/raft/raft_ivf_flat_wrapper.h | 2 + cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 2 + dependencies.yaml | 1 + docs/source/ann_benchmarks_build.md | 48 +++ docs/source/ann_benchmarks_dataset.md | 47 +++ docs/source/ann_benchmarks_low_level.md | 146 ++++++++ docs/source/cuda_ann_benchmarks.md | 322 ------------------ docs/source/index.rst | 2 +- docs/source/raft_ann_benchmarks.md | 262 ++++++++++++++ scripts/ann-benchmarks/algos.yaml | 30 ++ scripts/ann-benchmarks/data_export.py | 59 ++++ scripts/ann-benchmarks/get_dataset.py | 91 +++++ scripts/ann-benchmarks/plot.py | 240 +++++++++++++ scripts/ann-benchmarks/run.py | 185 ++++++++++ thirdparty/LICENSES/LICENSE.ann-benchmark | 21 ++ 18 files changed, 1139 insertions(+), 328 deletions(-) create mode 100644 docs/source/ann_benchmarks_build.md create mode 100644 docs/source/ann_benchmarks_dataset.md create mode 100644 docs/source/ann_benchmarks_low_level.md delete mode 100644 docs/source/cuda_ann_benchmarks.md create mode 100644 docs/source/raft_ann_benchmarks.md create mode 100644 scripts/ann-benchmarks/algos.yaml create mode 100644 scripts/ann-benchmarks/data_export.py create mode 100644 scripts/ann-benchmarks/get_dataset.py create mode 100644 scripts/ann-benchmarks/plot.py create mode 100644 scripts/ann-benchmarks/run.py create mode 100644 thirdparty/LICENSES/LICENSE.ann-benchmark diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 74b966cc03..a982febeed 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -30,6 +30,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libfaiss>=1.7.1 +- matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 diff --git a/cpp/bench/ann/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json index d210aca654..5d0bbf970c 100644 --- a/cpp/bench/ann/conf/glove-100-inner.json +++ b/cpp/bench/ann/conf/glove-100-inner.json @@ -789,9 +789,5 @@ ], "search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10" - }, - - - ] - + }] } diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index e898a13636..4efe808f3c 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -79,6 +79,8 @@ class RaftCagra : public ANN { void save(const std::string& file) const override; void load(const std::string&) override; + ~RaftCagra() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + private: raft::device_resources handle_; BuildParam index_params_; diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index bdd688f29b..42fb9bd4a1 100644 --- 
a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -79,6 +79,8 @@ class RaftIvfFlatGpu : public ANN { void save(const std::string& file) const override; void load(const std::string&) override; + ~RaftIvfFlatGpu() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + private: raft::device_resources handle_; BuildParam index_params_; diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 7d791e6d29..30bd5ab4d6 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -79,6 +79,8 @@ class RaftIvfPQ : public ANN { void save(const std::string& file) const override; void load(const std::string&) override; + ~RaftIvfPQ() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + private: raft::device_resources handle_; BuildParam index_params_; diff --git a/dependencies.yaml b/dependencies.yaml index 7466947ce6..331ecf43ec 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -169,6 +169,7 @@ dependencies: - h5py>=3.8.0 - libfaiss>=1.7.1 - faiss-proc=*=cuda + - matplotlib cudatoolkit: specific: diff --git a/docs/source/ann_benchmarks_build.md b/docs/source/ann_benchmarks_build.md new file mode 100644 index 0000000000..80730c5d68 --- /dev/null +++ b/docs/source/ann_benchmarks_build.md @@ -0,0 +1,48 @@ +### Dependencies + +CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. + +Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. + +In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: +1. FAISS GPU >= 1.7.1 +2. Google Logging (GLog) +3. H5Py +4. HNSWLib +5. nlohmann_json +6. GGNN + +[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. + +The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: + +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks +``` + +The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. + +### Compiling the Benchmarks + +After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. 
The following will build the executables for all the supported algorithms: +```bash +./build.sh bench-ann +``` + +You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): +```bash +./build.sh bench-ann -n --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH +``` + +Available targets to use with `--limit-bench-ann` are: +- FAISS_IVF_FLAT_ANN_BENCH +- FAISS_IVF_PQ_ANN_BENCH +- FAISS_BFKNN_ANN_BENCH +- GGNN_ANN_BENCH +- HNSWLIB_ANN_BENCH +- RAFT_CAGRA_ANN_BENCH +- RAFT_IVF_PQ_ANN_BENCH +- RAFT_IVF_FLAT_ANN_BENCH + +By default, the `*_ANN_BENCH` executables infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, an extension of `i8bin` uses an `int8_t` datatype, and `u8bin` uses the `uint8_t` type. Currently, only `float`, `float16`, `int8_t`, and `uint8_t` are supported. \ No newline at end of file diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md new file mode 100644 index 0000000000..99a6bfbd3a --- /dev/null +++ b/docs/source/ann_benchmarks_dataset.md @@ -0,0 +1,47 @@ +# ANN Benchmarks Datasets + +A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, the Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. + +The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of the vectors stored in the file is `float32`, `float16` (a.k.a. `half`), `int`, `uint8`, or `int8`, respectively. +These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. + +Some implementations can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform a dataset from `float32` to `float16` type. + +Commonly used datasets can be downloaded from two websites: +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). + + However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + ```bash + pip3 install numpy h5py + ``` + The usage of this script is: + ```bash + $ cpp/bench/ann/scripts/hdf5_to_fbin.py + usage: scripts/hdf5_to_fbin.py [-n] <input>.hdf5 + -n: normalize base/query set + outputs: <input>.base.fbin + <input>.query.fbin + <input>.groundtruth.neighbors.ibin + <input>.groundtruth.distances.fbin + ``` + So for an input `.hdf5` file, four output binary files will be produced. See the previous section for an example of preprocessing the GloVe dataset. + + Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand.
In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. + +2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, and thus should be split. A script is provided for this: + ```bash + $ cpp/bench/ann/scripts/split_groundtruth.pl + usage: script/split_groundtruth.pl input output_prefix + ``` + Take the Deep-1B dataset as an example: + ```bash + pushd + cd cpp/bench/ann + mkdir -p data/deep-1B && cd data/deep-1B + # download manually "Ground Truth" file of "Yandex DEEP" + # suppose the file name is deep_new_groundtruth.public.10K.bin + ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + popd + ``` + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This means we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional `subset_size` parameter for a dataset can be used. See the next step for further explanation. \ No newline at end of file diff --git a/docs/source/ann_benchmarks_low_level.md b/docs/source/ann_benchmarks_low_level.md new file mode 100644 index 0000000000..f95d01f66f --- /dev/null +++ b/docs/source/ann_benchmarks_low_level.md @@ -0,0 +1,146 @@ +### Low-level Scripts and Executables +#### End-to-end Example +An end-to-end example (run from the RAFT source code root directory): +```bash +# (1) prepare a dataset +pushd + +cd cpp/bench/ann +mkdir data && cd data +wget http://ann-benchmarks.com/glove-100-angular.hdf5 + +# option -n is used here to normalize vectors so cosine distance is converted +# to inner product; don't use -n for l2 distance +python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 + +mkdir glove-100-inner +mv glove-100-angular.base.fbin glove-100-inner/base.fbin +mv glove-100-angular.query.fbin glove-100-inner/query.fbin +mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin +mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin +popd + +# (2) build index +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (3) search +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (4) evaluate result +pushd +cd cpp/bench/ann +./scripts/eval.pl \ + -o result.csv \ + data/glove-100-inner/groundtruth.neighbors.ibin \ + result/glove-100-inner/faiss_ivf_flat +popd + +# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool +``` + +##### Step 1: Prepare Dataset +[Instructions](ann_benchmarks_dataset.md) + + +##### Step 2: Build Index +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. + +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example.
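At a high level, such a configuration file has the following shape. This is a minimal sketch only: the section and key names follow the descriptions below, while the `distance` value, the `nlist`/`nprobe` build and search parameters, and the file paths are illustrative assumptions rather than the contents of the shipped file:
```json
{
  "dataset" : {
    "name" : "glove-100-inner",
    "base_file" : "data/glove-100-inner/base.fbin",
    "query_file" : "data/glove-100-inner/query.fbin",
    "distance" : "inner_product"
  },
  "search_basic_param" : {
    "k" : 10,
    "run_count" : 3
  },
  "index" : [
    {
      "name" : "raft_ivf_flat.nlist1024",
      "algo" : "raft_ivf_flat",
      "build_param" : {"nlist" : 1024},
      "file" : "index/glove-100-inner/raft_ivf_flat/nlist1024",
      "search_params" : [{"nprobe" : 10}, {"nprobe" : 50}],
      "search_result_file" : "result/glove-100-inner/raft_ivf_flat/nlist1024"
    }
  ]
}
```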
The configuration file has 3 sections: +* The `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are used only for evaluation and thus are not needed. + - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. +* The `search_basic_param` section specifies basic parameters for searching: + - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the search. + - `run_count` is the number of times the search is run. A single run searches neighbors for all vectors in the `test` set. The total time used for a run is recorded, and the final search time is the smallest one among these runs. +* The `index` section specifies an array of configurations for index building and searching: + - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. + - `file` is the file name of the index. Building will save the built index to this file, while searching will load this file. + - `search_result_file` is the file name prefix of search results. Searching will save results to these files, and the plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in the `search_params` array. That is, each search parameter has its own corresponding search result files. + - If `multigpu` is specified, multiple GPUs will be used for index build and search. + - If `refine_ratio` is specified, refinement is done as a post-processing step of search. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, `2k` results are first computed, then their exact distances are computed using the original uncompressed vectors, and finally the top `k` results among them are kept. + + +The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: +```bash +$ ./cpp/build/*_ANN_BENCH -h +usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json + -b: build mode, will build index + -s: search mode, will search using built index + one and only one of -b and -s should be specified + -f: force overwriting existing output files + -i: by default will build/search all the indices found in conf.json + '-i' can be used to select a subset of indices + 'index_names' is a list of comma-separated index names + '*' is allowed as the last character of a name to select all matched indices + for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" +``` +* `-b`: build the index. +* `-s`: search with the built index. +* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so that no file is overwritten accidentally. To ignore existing output files and force-overwrite them, use the `-f` option. +* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices.
To select a subset of indices to build or search, we can use the `-i` option. + +It's easier to describe the usage of the `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: +```json + "index" : [ + { + "name" : "hnsw1", + ... + }, + { + "name" : "hnsw2", + ... + }, + { + "name" : "faiss", + ... + } + ] +``` +Then, +```bash +# build all indices: hnsw1, hnsw2 and faiss +./cpp/build/HNSWLIB_ANN_BENCH -b a.json + +# build only hnsw1 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json + +# build faiss +./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json +``` +In the fourth command, we use the wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. + + +##### Step 3: Searching +Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. + + +##### Step 4: Evaluating Results +Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: +```bash +$ cpp/bench/ann/scripts/eval.pl +usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -o: also write result to a csv file +``` +Note that there can be multiple arguments for the paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. +An example: +```bash +cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ + result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ + result/glove-100-angular/10/faiss/ +``` +The search result files used by this command are the files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under the directory `result/glove-100-angular/10/faiss/` recursively. + +This script prints recall and QPS for every result file. It also outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. + +It saves the recall value in the result txt file, so it avoids recomputing recall if the same command is run again. To force recall to be recomputed, the `-f` option can be used. If the `-o <output.csv>` option is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md deleted file mode 100644 index c9547e7d77..0000000000 --- a/docs/source/cuda_ann_benchmarks.md +++ /dev/null @@ -1,322 +0,0 @@ -# CUDA ANN Benchmarks - -This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. - -## Benchmark - -### Dependencies - -CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks.
- -Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. - -In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: -1. FAISS GPU >= 1.7.1 -2. Google Logging (GLog) -3. H5Py -4. HNSWLib -5. nlohmann_json -6. GGNN - -[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. - -The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: - -```bash -mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml -conda activate raft_ann_benchmarks -``` - -The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. - -### Compiling the Benchmarks - -After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. The following will build the executables for all the support algorithms: -```bash -./build.sh bench-ann -``` - -You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): -```bash -./build.sh bench-ann -n --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH -``` - -Available targets to use with `--limit-bench-ann` are: -- FAISS_IVF_FLAT_ANN_BENCH -- FAISS_IVF_PQ_ANN_BENCH -- FAISS_BFKNN_ANN_BENCH -- GGNN_ANN_BENCH -- HNSWLIB_ANN_BENCH -- RAFT_CAGRA_ANN_BENCH -- RAFT_IVF_PQ_ANN_BENCH -- RAFT_IVF_FLAT_ANN_BENCH - -By default, the `*_ANN_BENCH` executables program infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, int8_t`, and `unit8_t` are supported. - -### Usage -There are 4 general steps to running the benchmarks: -1. Prepare Dataset -2. Build Index -3. Search Using Built Index -4. 
Evaluate Result - -#### End-to-end Example -An end-to-end example (run from the RAFT source code root directory): -```bash -# (1) prepare a dataset -pushd - -cd cpp/bench/ann -mkdir data && cd data -wget http://ann-benchmarks.com/glove-100-angular.hdf5 - -# option -n is used here to normalize vectors so cosine distance is converted -# to inner product; don't use -n for l2 distance -python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 - -mkdir glove-100-inner -mv glove-100-angular.base.fbin glove-100-inner/base.fbin -mv glove-100-angular.query.fbin glove-100-inner/query.fbin -mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin -mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin -popd - -# (2) build index -./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json - -# (3) search -./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json - -# (4) evaluate result -pushd -cd cpp/bench/ann -./scripts/eval.pl \ - -o result.csv \ - data/glove-100-inner/groundtruth.neighbors.ibin \ - result/glove-100-inner/faiss_ivf_flat -popd - -# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool -``` - -##### Step 1: Prepare Dataset -A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. - -The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. -These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. - -Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. - -Commonly used datasets can be downloaded from two websites: -1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). - - However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: - ```bash - pip3 install numpy h5py - ``` - The usage of this script is: - ```bash - $ cpp/bench/ann/scripts/hdf5_to_fbin.py - usage: scripts/hdf5_to_fbin.py [-n] .hdf5 - -n: normalize base/query set - outputs: .base.fbin - .query.fbin - .groundtruth.neighbors.ibin - .groundtruth.distances.fbin - ``` - So for an input `.hdf5` file, four output binary files will be produced. See previous section for an example of prepossessing GloVe dataset. - - Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. 
However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. - -2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: - ```bash - $ cpp/bench/ann/scripts/split_groundtruth.pl - usage: script/split_groundtruth.pl input output_prefix - ``` - Take Deep-1B dataset as an example: - ```bash - pushd - cd cpp/bench/ann - mkdir -p data/deep-1B && cd data/deep-1B - # download manually "Ground Truth" file of "Yandex DEEP" - # suppose the file name is deep_new_groundtruth.public.10K.bin - ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth - # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced - popd - ``` - Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. - - -##### Step 2: Build Index -An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. - -To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: -* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. - - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. -* `search_basic_param` section specifies basic parameters for searching: - - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. - - `run_count` means how many times we run the searching. A single run of searching will search neighbors for all vectors in `test` set. The total time used for a run is recorded, and the final searching time is the smallest one among these runs. -* `index` section specifies an array of configurations for index building and searching: - - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. - - `file` is the file name of index. Building will save built index to this file, while searching will load this file. - - `search_result_file` is the file name prefix of searching results. Searching will save results to these files, and plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. 
Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in `search_params` array. That is, for one searching parameter, there will be some corresponding search result files. - - if `multigpu` is specified, multiple GPUs will be used for index build and search. - - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. - - -The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: -```bash -$ ./cpp/build/*_ANN_BENCH -h -usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json - -b: build mode, will build index - -s: search mode, will search using built index - one and only one of -b and -s should be specified - -f: force overwriting existing output files - -i: by default will build/search all the indices found in conf.json - '-i' can be used to select a subset of indices - 'index_names' is a list of comma-separated index names - '*' is allowed as the last character of a name to select all matched indices - for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" -``` -* `-b`: build index. -* `-s`: do the searching with built index. -* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. -* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. - -It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: -```json - "index" : [ - { - "name" : "hnsw1", - ... - }, - { - "name" : "hnsw1", - ... - }, - { - "name" : "faiss", - ... - } - ] -``` -Then, -```bash -# build all indices: hnsw1, hnsw2 and faiss -./cpp/build/HNSWLIB_ANN_BENCH -b a.json - -# build only hnsw1 -./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json - -# build hnsw1 and hnsw2 -./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json - -# build hnsw1 and hnsw2 -./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json - -# build faiss -./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json -``` -In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. - - -##### Step 3: Searching -Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. - - -##### Step 4: Evaluating Results -Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: -```bash -$ cpp/bench/ann/scripts/eval.pl -usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... - result_paths... are paths to the search result files. - Can specify multiple paths. 
- For each of them, if it's a directory, all the .txt files found under - it recursively will be regarded as inputs. - - -f: force to recompute recall and update it in result file if needed - -o: also write result to a csv file -``` -Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. -An example: -```bash -cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ - result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ - result/glove-100-angular/10/faiss/ -``` -The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively. - -This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. - -It saves recall value in result txt file, so avoids to recompute recall if the same command is run again. To force to recompute recall, option `-f` can be used. If option `-o ` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. - -## Adding a new ANN algorithm -Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. - -In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: -```c++ -template -class HnswLib : public ANN { -public: - struct BuildParam { - int M; - int ef_construction; - int num_threads; - }; - - using typename ANN::AnnSearchParam; - struct SearchParam : public AnnSearchParam { - int ef; - int num_threads; - }; - - // ... -}; -``` - -The benchmark program uses JSON configuration file. To add the new algorithm to the benchmark, need be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in configuration file. Still take the configuration for `HnswLib` as an example: -```json -{ - "name" : "...", - "algo" : "hnswlib", - "build_param": {"M":12, "efConstruction":500, "numThreads":32}, - "file" : "/path/to/file", - "search_params" : [ - {"ef":10, "numThreads":1}, - {"ef":20, "numThreads":1}, - {"ef":40, "numThreads":1}, - ], - "search_result_file" : "/path/to/file" -}, -``` - -How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: -1. First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: - ```c++ - template - void parse_build_param(const nlohmann::json& conf, - typename cuann::HnswLib::BuildParam& param) { - param.ef_construction = conf.at("efConstruction"); - param.M = conf.at("M"); - if (conf.contains("numThreads")) { - param.num_threads = conf.at("numThreads"); - } - } - - template - void parse_search_param(const nlohmann::json& conf, - typename cuann::HnswLib::SearchParam& param) { - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { - param.num_threads = conf.at("numThreads"); - } - } - ``` - -2. Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. 
The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, - ```c++ - // JSON configuration file contains a line like: "algo" : "hnswlib" - if (algo == "hnswlib") { - // ... - } - ``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 23e346c872..5bb42bce45 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,7 +44,7 @@ While not exhaustive, the following general categories help summarize the accele developer_guide.md cpp_api.rst pylibraft_api.rst - cuda_ann_benchmarks.md + raft_ann_benchmarks.md raft_dask_api.rst using_comms.rst using_libraft.md diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md new file mode 100644 index 0000000000..df0c21dd7b --- /dev/null +++ b/docs/source/raft_ann_benchmarks.md @@ -0,0 +1,262 @@ +# RAFT ANN Benchmarks + +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. + +## Installing the benchmarks + +The easiest way to install these benchmarks is through conda. We suggest using mamba as it generally leads to a faster install time: +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks + +mamba install -c rapidsai libraft-ann-bench +``` +The channel `rapidsai` can easily be substituted with `rapidsai-nightly` if nightly benchmarks are desired. + +Please see the [build instructions](ann_benchmarks_build.md) to build the benchmarks from source. + +## Running the benchmarks + +### Usage +There are 4 general steps to running the benchmarks and visualizing the results: +1. Prepare Dataset +2. Build Index and Search Index +3. Evaluate Results +4. Plot Results + +We provide a collection of lightweight Python scripts that are wrappers over +lower-level scripts and executables to run our benchmarks. Either the Python scripts or the +[low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks; +however, plots are only provided through our Python scripts. +### End-to-end example: Million-scale +```bash +# All scripts are present in directory raft/scripts/ann-benchmarks + +# (1) prepare dataset +python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize + +# (2) build and search index +python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json + +# (3) evaluate results +python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/ + +# (4) plot results +python scripts/ann-benchmarks/plot.py --result_csv out.csv +``` + +### End-to-end example: Billion-scale +`scripts/get_dataset.py` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.html#billion-scale) +because they are so large. You should instead use our billion-scale datasets guide to download and prepare them. +All other Python scripts mentioned below work as intended once the +billion-scale dataset has been downloaded.
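The prepared files follow the binary layout described in the [datasets guide](ann_benchmarks_dataset.md): an 8-byte header holding `num_vectors` and `num_dimensions` as `uint32` values, followed by the vectors in row-major order. As a quick sanity check after preparing a dataset, a minimal reader sketch along these lines can be used (assuming `numpy` is available; the file path shown is illustrative):
```python
import numpy as np

def read_bin(path, dtype):
    # Layout: two uint32 header fields (num_vectors, num_dimensions),
    # then num_vectors * num_dimensions values in row-major order.
    with open(path, "rb") as f:
        num_vectors, num_dims = np.fromfile(f, dtype=np.uint32, count=2)
        data = np.fromfile(f, dtype=dtype,
                           count=int(num_vectors) * int(num_dims))
    return data.reshape(int(num_vectors), int(num_dims))

# .ibin files hold int values, .fbin files hold float32 values
neighbors = read_bin("data/deep-1B/groundtruth.neighbors.ibin", np.int32)
print(neighbors.shape)
```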
+To download billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html) + +```bash +mkdir -p data/deep-1B && cd data/deep-1B +# (1) prepare dataset +# download manually "Ground Truth" file of "Yandex DEEP" +# suppose the file name is deep_new_groundtruth.public.10K.bin +../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth +# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + +# (2) build and search index +python scripts/run.py --configuration conf/deep-1B.json + +# (3) evaluate results +python scripts/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/ + +# (4) plot results +python scripts/plot.py --result_csv out.csv +``` + +##### Step 1: Prepare Dataset +The script `scripts/ann-benchmarks/get_dataset.py` will download and unpack the dataset in a directory +that the user provides. As of now, only million-scale datasets are supported by this +script. For more information, see [datasets and formats](ann_benchmarks_dataset.md). + +The usage of this script is: +```bash +usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize] + +options: + -h, --help show this help message and exit + --name NAME dataset to download (default: glove-100-angular) + --path PATH path to download dataset (default: {os.getcwd()}/data) + --normalize normalize cosine distance to inner product (default: False) +``` + +When the `normalize` option is provided to the script, any dataset that has cosine distances +will be normalized to use inner product. So, for example, the dataset `glove-100-angular` +will be written to the location `data/glove-100-inner/`. + +#### Step 2: Build and Search Index +The script `scripts/ann-benchmarks/run.py` will build and search indices for a given dataset and its +specified configuration. +To configure which algorithms are available, we use `algos.yaml`. +To configure building/searching indices for a dataset, look at [index configuration](#json-index-config). +An entry in `algos.yaml` looks like: +```yaml +raft_ivf_pq: + executable: RAFT_IVF_PQ_ANN_BENCH + disabled: false +``` +`executable` : specifies the binary that will build/search the index. It is assumed to be +available in `raft/cpp/build/`. +`disabled` : denotes whether an algorithm should be excluded from benchmark runs. + +The usage of the script `scripts/run.py` is: +```bash +usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force] + +options: + -h, --help show this help message and exit + --configuration CONFIGURATION + path to configuration file for a dataset (default: None) + --build + --search + --algorithms ALGORITHMS + run only comma separated list of named algorithms (default: None) + --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) + --force re-run algorithms even if their results already exist (default: False) +``` + +`build` and `search` : if neither parameter is supplied to the script, then +both are assumed to be `True`. + +`indices` and `algorithms` : these parameters are checked to ensure that the algorithm specified for an index +is available in `algos.yaml`, is not disabled, and has an associated executable. + +#### Step 3: Evaluating Results +The script `scripts/ann-benchmarks/data_export.py` will evaluate results for a dataset whose index has been built +and searched with at least one algorithm.
For every result file that is supplied to the script, the output +will be combined and written to a CSV file. + +The usage of this script is: +```bash +usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH + +options: + -h, --help show this help message and exit + --output OUTPUT Path to the CSV output file (default: None) + --recompute Recompute metrics (default: False) + --groundtruth GROUNDTRUTH + Path to groundtruth.neighbors.ibin file for a dataset (default: None) +``` + +`result_filepaths` : a whitespace-delimited list of result files/directories that can be captured via pattern match. For more information and examples, see the [low-level docs](ann_benchmarks_low_level.html#result-filepath-example). + +#### Step 4: Plot Results +The script `scripts/ann-benchmarks/plot.py` will plot all results that have been evaluated into a CSV file for a given dataset. + +The usage of this script is: +```bash +usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] + +options: + -h, --help show this help message and exit + --result_csv RESULT_CSV + Path to CSV Results (default: None) + --output OUTPUT Path to the PNG output file (default: /home/nfs/dgala/raft/out.png) + --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) + --y-scale {linear,log,symlog,logit} + Scale to use when drawing the Y-axis (default: linear) + --raw Show raw results (not just Pareto frontier) in faded colours (default: False) +``` + +All algorithms present in the CSV file supplied to this script with the parameter `result_csv` +will appear in the plot. + +## Adding a new ANN algorithm +### Implementation and Configuration +The implementation of a new algorithm should be a C++ class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. + +In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN<T>::AnnSearchParam`. Taking `class HnswLib` as an example, its definition is: +```c++ +template <typename T> +class HnswLib : public ANN<T> { +public: + struct BuildParam { + int M; + int ef_construction; + int num_threads; + }; + + using typename ANN<T>::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads; + }; + + // ... +}; +``` + +The benchmark program uses a JSON configuration file. To add the new algorithm to the benchmark, you need to be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in the configuration file. Again taking the configuration for `HnswLib` as an example: +```json +{ + "name" : "...", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "/path/to/file", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + ], + "search_result_file" : "/path/to/file" +}, +``` + +How to interpret these JSON objects is left entirely to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: +1.
First, add two functions for parsing the JSON object into `struct BuildParam` and `struct SearchParam`, respectively: + ```c++ + template <typename T> + void parse_build_param(const nlohmann::json& conf, + typename cuann::HnswLib<T>::BuildParam& param) { + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + + template <typename T> + void parse_search_param(const nlohmann::json& conf, + typename cuann::HnswLib<T>::SearchParam& param) { + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + ``` + +2. Next, add a corresponding `if` case to the functions `create_algo()` and `create_search_param()` by calling the parsing functions. The string literal in the `if` condition must be the same as the value of `algo` in the configuration file. For example, + ```c++ + // JSON configuration file contains a line like: "algo" : "hnswlib" + if (algo == "hnswlib") { + // ... + } + ``` + +### Adding a CMake Target +In `raft/cpp/bench/ann/CMakeLists.txt`, we provide a `CMake` function to configure a new benchmark target with the following signature: +``` +ConfigureAnnBench( + NAME <algo_name> + PATH <path_to_source> + INCLUDES <extra_includes> + CXXFLAGS <extra_flags> + LINKS <extra_link_libraries> +) +``` + +To add a target for `HNSWLIB`, we would call the function as: +``` +ConfigureAnnBench( + NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" +) +``` + +This will create an executable called `HNSWLIB_ANN_BENCH`, which can then be used to run `HNSWLIB` benchmarks. diff --git a/scripts/ann-benchmarks/algos.yaml b/scripts/ann-benchmarks/algos.yaml new file mode 100644 index 0000000000..54fddf607b --- /dev/null +++ b/scripts/ann-benchmarks/algos.yaml @@ -0,0 +1,30 @@ +faiss_gpu_ivf_flat: + executable: FAISS_IVF_FLAT_ANN_BENCH + disabled: false +faiss_gpu_flat: + executable: FAISS_IVF_FLAT_ANN_BENCH + disabled: false +faiss_gpu_ivf_pq: + executable: FAISS_IVF_PQ_ANN_BENCH + disabled: false +faiss_gpu_ivf_sq: + executable: FAISS_IVF_PQ_ANN_BENCH + disabled: false +faiss_gpu_bfknn: + executable: FAISS_BFKNN_ANN_BENCH + disabled: false +raft_ivf_flat: + executable: RAFT_IVF_FLAT_ANN_BENCH + disabled: false +raft_ivf_pq: + executable: RAFT_IVF_PQ_ANN_BENCH + disabled: false +raft_cagra: + executable: RAFT_CAGRA_ANN_BENCH + disabled: false +ggnn: + executable: GGNN_ANN_BENCH + disabled: false +hnswlib: + executable: HNSWLIB_ANN_BENCH + disabled: false \ No newline at end of file diff --git a/scripts/ann-benchmarks/data_export.py b/scripts/ann-benchmarks/data_export.py new file mode 100644 index 0000000000..18c6a1a4d2 --- /dev/null +++ b/scripts/ann-benchmarks/data_export.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import argparse +import os +import subprocess + + +def export_results(output_filepath, recompute, groundtruth_filepath, + result_filepaths): + print(f"Writing output file to: {output_filepath}") + ann_bench_scripts_dir = "cpp/bench/ann/scripts" + ann_bench_scripts_path = os.path.join(os.getcwd(), + ann_bench_scripts_dir, + "eval.pl") + if recompute: + p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath, + groundtruth_filepath] + result_filepaths) + else: + p = subprocess.Popen([ann_bench_scripts_path, "-o", output_filepath, + groundtruth_filepath] + result_filepaths) + p.wait() + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--output", help="Path to the CSV output file", + required=True) + parser.add_argument("--recompute", action="store_true", + help="Recompute metrics") + parser.add_argument("--groundtruth", + help="Path to groundtruth.neighbors.ibin file for a dataset", + required=True) + args, result_filepaths = parser.parse_known_args() + + # if nothing is provided + if len(result_filepaths) == 0: + raise ValueError("No filepaths to results were provided") + + groundtruth_filepath = args.groundtruth + export_results(args.output, args.recompute, groundtruth_filepath, + result_filepaths) + + +if __name__ == "__main__": + main() diff --git a/scripts/ann-benchmarks/get_dataset.py b/scripts/ann-benchmarks/get_dataset.py new file mode 100644 index 0000000000..c071296125 --- /dev/null +++ b/scripts/ann-benchmarks/get_dataset.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import subprocess +from urllib.request import urlretrieve + + +def get_dataset_path(name, ann_bench_data_path): + if not os.path.exists(ann_bench_data_path): + os.mkdir(ann_bench_data_path) + return os.path.join(ann_bench_data_path, f"{name}.hdf5") + + +def download_dataset(url, path): + if not os.path.exists(path): + print(f"downloading {url} -> {path}...") + urlretrieve(url, path) + + +def convert_hdf5_to_fbin(path, normalize): + ann_bench_scripts_dir = "cpp/bench/ann/scripts" + ann_bench_scripts_path = os.path.join(os.getcwd(), ann_bench_scripts_dir, + "hdf5_to_fbin.py") + if normalize and "angular" in path: + p = subprocess.Popen(["python", ann_bench_scripts_path, "-n", + "%s" % path]) + else: + p = subprocess.Popen(["python", ann_bench_scripts_path, + "%s" % path]) + p.wait() + + +def move(name, ann_bench_data_path): + if "angular" in name: + new_name = name.replace("angular", "inner") + else: + new_name = name + new_path = os.path.join(ann_bench_data_path, new_name) + if not os.path.exists(new_path): + os.mkdir(new_path) + for bin_name in ["base.fbin", "query.fbin", "groundtruth.neighbors.ibin", + "groundtruth.distances.fbin"]: + os.rename(f"{ann_bench_data_path}/{name}.{bin_name}", + f"{new_path}/{bin_name}") + + +def download(name, normalize, ann_bench_data_path): + path = get_dataset_path(name, ann_bench_data_path) + try: + url = f"http://ann-benchmarks.com/{name}.hdf5" + download_dataset(url, path) + + convert_hdf5_to_fbin(path, normalize) + + move(name, ann_bench_data_path) + except Exception: + print(f"Cannot download {url}") + raise + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--name", help="dataset to download", + default="glove-100-angular") + parser.add_argument("--path", help="path to download dataset", + default=os.path.join(os.getcwd(), "data")) + parser.add_argument("--normalize", + help="normalize cosine distance to inner product", + action="store_true") + args = parser.parse_args() + + download(args.name, args.normalize, args.path) + + +if __name__ == "__main__": + main() diff --git a/scripts/ann-benchmarks/plot.py b/scripts/ann-benchmarks/plot.py new file mode 100644 index 0000000000..772bdf8738 --- /dev/null +++ b/scripts/ann-benchmarks/plot.py @@ -0,0 +1,240 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This script is inspired by +# 1: https://github.com/erikbern/ann-benchmarks/blob/main/plot.py +# 2: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/utils.py +# 3: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/metrics.py +# Licence: https://github.com/erikbern/ann-benchmarks/blob/main/LICENSE + +import matplotlib as mpl + +mpl.use("Agg") # noqa +import argparse +import itertools +import matplotlib.pyplot as plt +import numpy as np +import os + + + +metrics = { + "k-nn": { + "description": "Recall", + "worst": float("-inf"), + "lim": [0.0, 1.03], + }, + "qps": { + "description": "Queries per second (1/s)", + "worst": float("-inf"), + } +} + + +def generate_n_colors(n): + vs = np.linspace(0.3, 0.9, 7) + colors = [(0.9, 0.4, 0.4, 1.0)] + + def euclidean(a, b): + return sum((x - y) ** 2 for x, y in zip(a, b)) + + while len(colors) < n: + new_color = max(itertools.product(vs, vs, vs), key=lambda a: min(euclidean(a, b) for b in colors)) + colors.append(new_color + (1.0,)) + return colors + + +def create_linestyles(unique_algorithms): + colors = dict(zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) + linestyles = dict((algo, ["--", "-.", "-", ":"][i % 4]) for i, algo in enumerate(unique_algorithms)) + markerstyles = dict((algo, ["+", "<", "o", "*", "x"][i % 5]) for i, algo in enumerate(unique_algorithms)) + faded = dict((algo, (r, g, b, 0.3)) for algo, (r, g, b, a) in colors.items()) + return dict((algo, (colors[algo], faded[algo], linestyles[algo], markerstyles[algo])) for algo in unique_algorithms) + + +def get_up_down(metric): + if metric["worst"] == float("inf"): + return "down" + return "up" + + +def get_left_right(metric): + if metric["worst"] == float("inf"): + return "left" + return "right" + + +def get_plot_label(xm, ym): + template = "%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and" " to the %(leftright)s is better" + return template % { + "xlabel": xm["description"], + "ylabel": ym["description"], + "updown": get_up_down(ym), + "leftright": get_left_right(xm), + } + + +def create_pointset(data, xn, yn): + xm, ym = (metrics[xn], metrics[yn]) + rev_y = -1 if ym["worst"] < 0 else 1 + rev_x = -1 if xm["worst"] < 0 else 1 + data.sort(key=lambda t: (rev_y * t[-1], rev_x * t[-2])) + + axs, ays, als = [], [], [] + # Generate Pareto frontier + xs, ys, ls = [], [], [] + last_x = xm["worst"] + comparator = (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) + for algo_name, xv, yv in data: + if not xv or not yv: + continue + axs.append(xv) + ays.append(yv) + als.append(algo_name) + if comparator(xv, last_x): + last_x = xv + xs.append(xv) + ys.append(yv) + ls.append(algo_name) + return xs, ys, ls, axs, ays, als + + +def create_plot(all_data, raw, x_scale, y_scale, fn_out, linestyles): + xn = "k-nn" + yn = "qps" + xm, ym = (metrics[xn], metrics[yn]) + # Now generate each plot + handles = [] + labels = [] + plt.figure(figsize=(12, 9)) + + # Sorting by mean y-value helps aligning plots with labels + def mean_y(algo): + xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) + return -np.log(np.array(ys)).mean() + + # Find range for logit x-scale + min_x, max_x = 1, 0 + for algo in sorted(all_data.keys(), key=mean_y): + xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) + min_x = min([min_x] + [x for x in xs if x > 0]) + max_x = max([max_x] + [x for x in xs if x < 1]) + color, faded, linestyle, marker = linestyles[algo] + (handle,) = plt.plot( 
+
+
+def create_plot(all_data, raw, x_scale, y_scale, fn_out, linestyles):
+    xn = "k-nn"
+    yn = "qps"
+    xm, ym = (metrics[xn], metrics[yn])
+    # Now generate each plot
+    handles = []
+    labels = []
+    plt.figure(figsize=(12, 9))
+
+    # Sorting by mean y-value helps aligning plots with labels
+    def mean_y(algo):
+        xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn)
+        return -np.log(np.array(ys)).mean()
+
+    # Find range for logit x-scale
+    min_x, max_x = 1, 0
+    for algo in sorted(all_data.keys(), key=mean_y):
+        xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn)
+        min_x = min([min_x] + [x for x in xs if x > 0])
+        max_x = max([max_x] + [x for x in xs if x < 1])
+        color, faded, linestyle, marker = linestyles[algo]
+        (handle,) = plt.plot(
+            xs, ys, "-", label=algo, color=color, ms=7, mew=3, lw=3, marker=marker
+        )
+        handles.append(handle)
+        if raw:
+            # Raw (non-frontier) points are drawn faded and kept out of the legend
+            plt.plot(
+                axs, ays, "-", label=algo, color=faded, ms=5, mew=2, lw=2, marker=marker
+            )
+        labels.append(algo)
+
+    ax = plt.gca()
+    ax.set_ylabel(ym["description"])
+    ax.set_xlabel(xm["description"])
+    # Custom scales of the type --x-scale a3
+    if x_scale[0] == "a":
+        alpha = float(x_scale[1:])
+
+        def fun(x):
+            return 1 - (1 - x) ** (1 / alpha)
+
+        def inv_fun(x):
+            return 1 - (1 - x) ** alpha
+
+        ax.set_xscale("function", functions=(fun, inv_fun))
+        if alpha <= 3:
+            ticks = [inv_fun(x) for x in np.arange(0, 1.2, 0.2)]
+            plt.xticks(ticks)
+        if alpha > 3:
+            from matplotlib import ticker
+
+            ax.xaxis.set_major_formatter(ticker.LogitFormatter())
+            plt.xticks([0, 1 / 2, 1 - 1e-1, 1 - 1e-2, 1 - 1e-3, 1 - 1e-4, 1])
+    # Other x-scales
+    else:
+        ax.set_xscale(x_scale)
+    ax.set_yscale(y_scale)
+    ax.set_title(get_plot_label(xm, ym))
+    ax.legend(handles, labels, loc="center left", bbox_to_anchor=(1, 0.5), prop={"size": 9})
+    plt.grid(visible=True, which="major", color="0.65", linestyle="-")
+    plt.setp(ax.get_xminorticklabels(), visible=True)
+
+    # Logit scale has to be a subset of (0,1)
+    if "lim" in xm and x_scale != "logit":
+        x0, x1 = xm["lim"]
+        plt.xlim(max(x0, 0), min(x1, 1))
+    elif x_scale == "logit":
+        plt.xlim(min_x, max_x)
+    if "lim" in ym:
+        plt.ylim(ym["lim"])
+
+    # Workaround for bug https://github.com/matplotlib/matplotlib/issues/6789
+    ax.spines["bottom"]._adjust_location()
+
+    plt.savefig(fn_out, bbox_inches="tight")
+    plt.close()
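+
+
+# `load_all_results` below expects a CSV such as the one produced by the
+# companion data_export.py script: a single header row (skipped below), then
+# one row per run in the form `<algo>.<index-name>,<recall>,<qps>`.
+# Hypothetical example rows (values are illustrative only):
+#   raft_cagra.dim32,0.945,12000.0
+#   hnswlib.M16,0.910,8500.0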
+
+
+def load_all_results(result_filepath):
+    results = dict()
+    with open(result_filepath, 'r') as f:
+        # Skip the header row; group rows by the algorithm prefix of the
+        # index name
+        for line in f.readlines()[1:]:
+            split_lines = line.split(',')
+            algo_name = split_lines[0].split('.')[0]
+            if algo_name not in results:
+                results[algo_name] = []
+            results[algo_name].append([algo_name, float(split_lines[1]),
+                                       float(split_lines[2])])
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--result_csv", help="Path to CSV Results", required=True)
+    parser.add_argument("--output", help="Path to the PNG output file",
+                        default=f"{os.getcwd()}/out.png")
+    parser.add_argument(
+        "--x-scale",
+        help="Scale to use when drawing the X-axis. "
+             "Typically linear, logit or a2",
+        default="linear"
+    )
+    parser.add_argument(
+        "--y-scale",
+        help="Scale to use when drawing the Y-axis",
+        choices=["linear", "log", "symlog", "logit"],
+        default="linear",
+    )
+    parser.add_argument(
+        "--raw", help="Show raw results (not just Pareto frontier) in faded colours", action="store_true"
+    )
+    args = parser.parse_args()
+
+    print(f"writing output to {args.output}")
+
+    results = load_all_results(args.result_csv)
+    linestyles = create_linestyles(sorted(results.keys()))
+
+    create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ann-benchmarks/run.py b/scripts/ann-benchmarks/run.py
new file mode 100644
index 0000000000..5a2be49af6
--- /dev/null
+++ b/scripts/ann-benchmarks/run.py
@@ -0,0 +1,185 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+import subprocess
+import yaml
+
+
+def validate_algorithm(algos_conf, algo):
+    # An algorithm is runnable only if it is listed in algos.yaml and is
+    # not marked as disabled
+    algos_conf_keys = set(algos_conf.keys())
+    return algo in algos_conf_keys and not algos_conf[algo]["disabled"]
+
+
+def find_executable(algos_conf, algo):
+    executable = algos_conf[algo]["executable"]
+    # Prefer a conda-installed benchmark binary; fall back to a local
+    # source build under cpp/build
+    conda_prefix = os.getenv("CONDA_PREFIX")
+    if conda_prefix is not None:
+        conda_path = os.path.join(conda_prefix, "bin", "ann", executable)
+        if os.path.exists(conda_path):
+            return (executable, conda_path)
+    build_path = os.path.join(os.getcwd(), "cpp", "build", executable)
+    if os.path.exists(build_path):
+        return (executable, build_path)
+    raise FileNotFoundError(executable)
+
+
+def run_build_and_search(conf_filename, conf_file, executables_to_run,
+                         force, ann_bench_path, build, search):
+    for executable, ann_executable_path in executables_to_run.keys():
+        # Write a temporary configuration restricted to the indices this
+        # executable is responsible for
+        temp_conf_filename = f"temporary_executable_{conf_filename}"
+        temp_conf_filepath = os.path.join(ann_bench_path, "conf",
+                                          temp_conf_filename)
+        with open(temp_conf_filepath, "w") as f:
+            temp_conf = dict()
+            temp_conf["dataset"] = conf_file["dataset"]
+            temp_conf["search_basic_param"] = conf_file["search_basic_param"]
+            temp_conf["index"] = executables_to_run[(executable,
+                                                     ann_executable_path)]["index"]
+            json.dump(temp_conf, f)
+
+        # `-f` forces a re-run even when result files already exist
+        force_flag = ["-f"] if force else []
+        if build:
+            p = subprocess.Popen([ann_executable_path, "-b"] + force_flag
+                                 + [temp_conf_filepath])
+            p.wait()
+
+        if search:
+            p = subprocess.Popen([ann_executable_path, "-s"] + force_flag
+                                 + [temp_conf_filepath])
+            p.wait()
+
+        os.remove(temp_conf_filepath)
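+
+
+# `validate_algorithm` and `find_executable` above assume that algos.yaml
+# maps each algorithm name to its benchmark executable and a `disabled`
+# flag. A sketch of one entry (the names are illustrative, not taken from
+# the shipped file):
+#
+#   raft_ivf_flat:
+#     executable: RAFT_IVF_FLAT_ANN_BENCH
+#     disabled: false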
\ + parameter `algorithms` is ignored", + default=None) + parser.add_argument("--force", + help="re-run algorithms even if their results \ + already exist", + action="store_true") + + args = parser.parse_args() + + # If both build and search are not provided, + # run both + if not args.build and not args.search: + build = True + search = True + else: + if args.build: + build = args.build + if args.search: + search = args.search + + # Read configuration file associated to dataset + conf_filepath = args.configuration + conf_filename = conf_filepath.split("/")[-1] + if not os.path.exists(conf_filepath): + raise FileNotFoundError(conf_filename) + + with open(conf_filepath, "r") as f: + conf_file = json.load(f) + + # Ensure base and query files exist for dataset + if not os.path.exists(conf_file["dataset"]["base_file"]): + raise FileNotFoundError(conf_file["dataset"]["base_file"]) + if not os.path.exists(conf_file["dataset"]["query_file"]): + raise FileNotFoundError(conf_file["dataset"]["query_file"]) + + executables_to_run = dict() + # At least one named index should exist in config file + if args.indices: + indices = set(args.indices.split(",")) + # algo associated with index should still be present in algos.yaml + # and enabled + for index in conf_file["index"]: + curr_algo = index["algo"] + if index["name"] in indices and \ + validate_algorithm(algos_conf, curr_algo): + executable_path = find_executable(algos_conf, curr_algo) + if executable_path not in executables_to_run: + executables_to_run[executable_path] = {"index": []} + executables_to_run[executable_path]["index"].append(index) + + # switch to named algorithms if indices parameter is not supplied + elif args.algorithms: + algorithms = set(args.algorithms.split(",")) + # pick out algorithms from conf file that exist + # and are enabled in algos.yaml + for index in conf_file["index"]: + curr_algo = index["algo"] + if curr_algo in algorithms and \ + validate_algorithm(algos_conf, curr_algo): + executable_path = find_executable(algos_conf, curr_algo) + if executable_path not in executables_to_run: + executables_to_run[executable_path] = {"index": []} + executables_to_run[executable_path]["index"].append(index) + + # default, try to run all available algorithms + else: + for index in conf_file["index"]: + curr_algo = index["algo"] + if validate_algorithm(algos_conf, curr_algo): + executable_path = find_executable(algos_conf, curr_algo) + if executable_path not in executables_to_run: + executables_to_run[executable_path] = {"index": []} + executables_to_run[executable_path]["index"].append(index) + + run_build_and_search(conf_filename, conf_file, executables_to_run, + args.force, ann_bench_path, build, search) + + +if __name__ == "__main__": + main() diff --git a/thirdparty/LICENSES/LICENSE.ann-benchmark b/thirdparty/LICENSES/LICENSE.ann-benchmark new file mode 100644 index 0000000000..9f8e4222f6 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.ann-benchmark @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Erik Bernhardsson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file