Improvements to ANN Benchmark Python scripts and docs (#1734)

Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Ray Douglass (https://github.com/raydouglass) URL: #1734
rapidsai · Aug 11, 2023 · 25b6916 · 25b6916
1 parent b3bb21a
commit 25b6916
Show file tree

Hide file tree

Showing 14 changed files with 92 additions and 36 deletions.
diff --git a/scripts/ann-benchmarks/algos.yaml → bench/ann/algos.yaml b/scripts/ann-benchmarks/algos.yaml → bench/ann/algos.yaml
@@ -1,4 +1,4 @@
-faise_gpu_ivf_flat:
+faiss_gpu_ivf_flat:
   executable: FAISS_IVF_FLAT_ANN_BENCH
   disabled: false
 faiss_gpu_flat:

diff --git a/cpp/bench/ann/conf/bigann-100M.json → bench/ann/conf/bigann-100M.json b/cpp/bench/ann/conf/bigann-100M.json → bench/ann/conf/bigann-100M.json
diff --git a/cpp/bench/ann/conf/deep-100M.json → bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json → bench/ann/conf/deep-100M.json
diff --git a/cpp/bench/ann/conf/deep-1B.json → bench/ann/conf/deep-1B.json b/cpp/bench/ann/conf/deep-1B.json → bench/ann/conf/deep-1B.json
diff --git a/cpp/bench/ann/conf/glove-100-inner.json → bench/ann/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json → bench/ann/conf/glove-100-inner.json
diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json → bench/ann/conf/sift-128-euclidean.json b/cpp/bench/ann/conf/sift-128-euclidean.json → bench/ann/conf/sift-128-euclidean.json
diff --git a/scripts/ann-benchmarks/data_export.py → bench/ann/data_export.py b/scripts/ann-benchmarks/data_export.py → bench/ann/data_export.py
@@ -41,16 +41,24 @@ def main():
                         required=True)
     parser.add_argument("--recompute", action="store_true",
                         help="Recompute metrics")
-    parser.add_argument("--groundtruth",
-                        help="Path to groundtruth.neighbors.ibin file for a dataset",
-                        required=True)
+    parser.add_argument("--dataset",
+                        help="Name of the dataset to export results for",
+                        default="glove-100-inner")
+    parser.add_argument(
+        "--dataset-path",
+        help="path to dataset folder",
+        default=os.path.join(os.getenv("RAFT_HOME"), 
+                             "bench", "ann", "data")
+    )
+
     args, result_filepaths = parser.parse_known_args()
 
     # if nothing is provided
     if len(result_filepaths) == 0:
         raise ValueError("No filepaths to results were provided")
 
-    groundtruth_filepath = args.groundtruth
+    groundtruth_filepath = os.path.join(args.dataset_path, args.dataset, 
+                                        "groundtruth.neighbors.ibin")
     export_results(args.output, args.recompute, groundtruth_filepath,
                    result_filepaths)
 

diff --git a/scripts/ann-benchmarks/get_dataset.py → bench/ann/get_dataset.py b/scripts/ann-benchmarks/get_dataset.py → bench/ann/get_dataset.py
@@ -76,16 +76,17 @@ def download(name, normalize, ann_bench_data_path):
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--name", help="dataset to download",
+    parser.add_argument("--dataset", help="dataset to download",
                         default="glove-100-angular")
-    parser.add_argument("--path", help="path to download dataset",
-                        default=os.path.join(os.getcwd(), "data"))
+    parser.add_argument("--dataset-path", help="path to download dataset",
+                        default=os.path.join(os.getenv("RAFT_HOME"), 
+                                             "bench", "ann", "data"))
     parser.add_argument("--normalize",
                         help="normalize cosine distance to inner product",
                         action="store_true")
     args = parser.parse_args()
 
-    download(args.name, args.normalize, args.path)
+    download(args.dataset, args.normalize, args.dataset_path)
 
 
 if __name__ == "__main__":

diff --git a/scripts/ann-benchmarks/plot.py → bench/ann/plot.py b/scripts/ann-benchmarks/plot.py → bench/ann/plot.py
@@ -208,7 +208,7 @@ def load_all_results(result_filepath):
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--result_csv", help="Path to CSV Results", required=True)
+    parser.add_argument("--result-csv", help="Path to CSV Results", required=True)
     parser.add_argument("--output", help="Path to the PNG output file",
                         default=f"{os.getcwd()}/out.png")
     parser.add_argument(

diff --git a/scripts/ann-benchmarks/run.py → bench/ann/run.py b/scripts/ann-benchmarks/run.py → bench/ann/run.py
@@ -86,7 +86,17 @@ def main():
     parser.add_argument(
         "--configuration",
         help="path to configuration file for a dataset",
-        required=True
+    )
+    parser.add_argument(
+        "--dataset",
+        help="dataset whose configuration file will be used",
+        default="glove-100-inner"
+    )
+    parser.add_argument(
+        "--dataset-path",
+        help="path to dataset folder",
+        default=os.path.join(os.getenv("RAFT_HOME"), 
+                             "bench", "ann", "data")
     )
     parser.add_argument(
         "--build",
@@ -121,15 +131,23 @@ def main():
         search = args.search
 
     # Read configuration file associated to dataset
-    conf_filepath = args.configuration
+    if args.configuration:
+        conf_filepath = args.configuration
+    else:
+        conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json")
     conf_filename = conf_filepath.split("/")[-1]
     conf_filedir = "/".join(conf_filepath.split("/")[:-1])
+    dataset_name = conf_filename.replace(".json", "")
+    dataset_path = os.path.join(args.dataset_path, dataset_name)
     if not os.path.exists(conf_filepath):
         raise FileNotFoundError(conf_filename)
 
     with open(conf_filepath, "r") as f:
         conf_file = json.load(f)
 
+    # Replace base, query to dataset-path
+    conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin")
+    conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin")
     # Ensure base and query files exist for dataset
     if not os.path.exists(conf_file["dataset"]["base_file"]):
         raise FileNotFoundError(conf_file["dataset"]["base_file"])
@@ -175,6 +193,14 @@ def main():
                     executables_to_run[executable_path] = {"index": []}
                 executables_to_run[executable_path]["index"].append(index)
 
+    # Replace build, search to dataset path
+    for executable_path in executables_to_run:
+        for pos, index in enumerate(executables_to_run[executable_path]["index"]):
+            index["file"] = os.path.join(dataset_path, "index", index["name"])
+            index["search_result_file"] = \
+                os.path.join(dataset_path, "result", index["name"])
+            executables_to_run[executable_path]["index"][pos] = index
+
     run_build_and_search(conf_filename, conf_file, executables_to_run,
                          args.force, conf_filedir, build, search)
 

diff --git a/scripts/ann-benchmarks/split_groundtruth.py → bench/ann/split_groundtruth.py b/scripts/ann-benchmarks/split_groundtruth.py → bench/ann/split_groundtruth.py
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -34,6 +34,7 @@ dependencies:
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
+- pyyaml
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
@@ -170,6 +170,7 @@ dependencies:
           - libfaiss>=1.7.1
           - faiss-proc=*=cuda
           - matplotlib
+          - pyyaml
 
   cudatoolkit:
     specific:

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
@@ -9,7 +9,10 @@ The easiest way to install these benchmarks is through conda. We suggest using m
 mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
 conda activate raft_ann_benchmarks
 
-mamba install -c rapidsai libraft-ann-bench
+mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-ann-bench cudatoolkit=11.8*
+
+git clone https://github.com/rapidsai/raft.git && cd raft
+export RAFT_HOME=$(pwd)
 ```
 The channel `rapidsai` can easily be substituted with `rapidsai-nightly` if nightly benchmarks are desired.
 
@@ -35,16 +38,16 @@ export RAFT_HOME=$(pwd)
 # All scripts are present in directory raft/bench/ann
 
 # (1) prepare dataset
-python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize
+python bench/ann/get_dataset.py --dataset glove-100-angular --normalize
 
 # (2) build and search index
-python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json
+python bench/ann/run.py --configuration bench/ann/conf/glove-100-inner.json
 
 # (3) evaluate results
-python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/
+python bench/ann/data_export.py --output out.csv --dataset glove-100-inner bench/ann/data/glove-100-inner/result/
 
 # (4) plot results
-python scripts/ann-benchmarks/plot.py --result_csv out.csv
+python bench/ann/plot.py --result-csv out.csv
 ```
 
 ### End-to-end example: Billion-scale
@@ -62,17 +65,17 @@ mkdir -p data/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-python scripts/ann-benchmarks/split_groundtruth.py data/deep-1B/deep_new_groundtruth.public.10K.bin
+python bench/ann/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
 # (2) build and search index
-python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json
+python bench/ann/run.py --configuration bench/ann/conf/deep-1B.json
 
 # (3) evaluate results
-python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
+python bench/ann/data_export.py --output out.csv --dataset deep-1B bench/ann/data/deep-1B/result/
 
 # (4) plot results
-python scripts/ann-benchmarks/plot.py --result_csv out.csv
+python bench/ann/plot.py --result-csv out.csv
 ```
 
 The usage of `bench/ann/split_groundtruth.py` is:
@@ -92,18 +95,18 @@ script. For more information on [datasets and formats](ann_benchmarks_dataset.md
 
 The usage of this script is:
 ```bash
-usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize]
+usage: get_dataset.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--normalize]
 
 options:
-  -h, --help   show this help message and exit
-  --name NAME  dataset to download (default: glove-100-angular)
-  --path PATH  path to download dataset (default: {os.getcwd()}/data)
-  --normalize  normalize cosine distance to inner product (default: False)
-```
+  -h, --help            show this help message and exit
+  --dataset DATASET     dataset to download (default: glove-100-angular)
+  --dataset-path DATASET_PATH
+                        path to download dataset (default: ${RAFT_HOME}/bench/ann/data)
+  --normalize           normalize cosine distance to inner product (default: False)
+```
 
 When option `normalize` is provided to the script, any dataset that has cosine distances
 will be normalized to inner product. So, for example, the dataset `glove-100-angular` 
-will be written at location `data/glove-100-inner/`.
+will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`.
 
 #### Step 2: Build and Search Index
 The script `bench/ann/run.py` will build and search indices for a given dataset and its
@@ -122,19 +125,34 @@ available in `raft/cpp/build/`.
 
 The usage of the script `bench/ann/run.py` is:
 ```bash
-usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force]
+usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
 
 options:
   -h, --help            show this help message and exit
   --configuration CONFIGURATION
                         path to configuration file for a dataset (default: None)
+  --dataset DATASET     dataset whose configuration file will be used (default: glove-100-inner)
+  --dataset-path DATASET_PATH
+                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
   --build
   --search
   --algorithms ALGORITHMS
                         run only comma separated list of named algorithms (default: None)
   --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
-  --force               re-run algorithms even if their results already exist (default: False)
+  -f, --force           re-run algorithms even if their results already exist (default: False)
 ```
+`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.
+The configuration file should be named `<dataset>.json`. It is optional if the name of the dataset is
+provided with the `dataset` argument, in which case
+a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
+
+`dataset-path` : 
+1. data is read from `<dataset-path>/<dataset>`
+2. indices are built in `<dataset-path>/<dataset>/index`
+3. search results are stored in `<dataset-path>/<dataset>/result`
 
 `build` and `search` : if both parameters are not supplied to the script then
 it is assumed both are `True`.
@@ -149,17 +167,18 @@ will be combined and written to a CSV file.
 
 The usage of this script is:
 ```bash
-usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH <result_filepaths>
+usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH] <result-filepaths>
 
 options:
   -h, --help            show this help message and exit
   --output OUTPUT       Path to the CSV output file (default: None)
   --recompute           Recompute metrics (default: False)
-  --groundtruth GROUNDTRUTH
-                        Path to groundtruth.neighbors.ibin file for a dataset (default: None)
+  --dataset DATASET     Name of the dataset to export results for (default: glove-100-inner)
+  --dataset-path DATASET_PATH
+                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
 ```
 
-`result_filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)
+`result-filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)
 
 #### Step 4: Plot Results
 The script `bench/ann/plot.py` will plot all results evaluated to a CSV file for a given dataset.
@@ -170,9 +189,9 @@ usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE
 
 options:
   -h, --help            show this help message and exit
-  --result_csv RESULT_CSV
+  --result-csv RESULT_CSV
                         Path to CSV Results (default: None)
-  --output OUTPUT       Path to the PNG output file (default: /home/nfs/dgala/raft/out.png)
+  --output OUTPUT       Path to the PNG output file (default: ${RAFT_HOME}/out.png)
   --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
   --y-scale {linear,log,symlog,logit}
                         Scale to use when drawing the Y-axis (default: linear)