ANN benchmarks python wrapper for splitting billion-scale dataset gro…

…undtruth (#1679) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #1679
rapidsai · Jul 26, 2023 · acd40f5 · acd40f5
1 parent 617d33a
commit acd40f5
Show file tree

Hide file tree

Showing 5 changed files with 79 additions and 16 deletions.
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
@@ -27,9 +27,11 @@ There are 4 general steps to running the benchmarks and vizualizing the results:
 We provide a collection of lightweight Python scripts that are wrappers over
 lower level scripts and executables to run our benchmarks. Either Python scripts or
 [low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks,
-however plots are only provided through our Python scripts.
+however plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is
+expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned.
 ### End-to-end example: Million-scale
 ```bash
+export RAFT_HOME=$(pwd)
 # All scripts are present in directory raft/scripts/ann-benchmarks
 
 # (1) prepare dataset
@@ -53,21 +55,34 @@ billion-scale dataset has been downloaded.
 To download Billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html)
 
 ```bash
-mkdir -p data/deep-1B && cd data/deep-1B
+export RAFT_HOME=$(pwd)
+# All scripts are present in directory raft/scripts/ann-benchmarks
+
+mkdir -p data/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth
+python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
 # (2) build and search index
-python scripts/run.py --configuration conf/deep-1B.json
+python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json
 
 # (3) evaluate results
-python scripts/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
+python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
 
 # (4) plot results
-python scripts/plot.py --result_csv out.csv
+python scripts/ann-benchmarks/plot.py --result_csv out.csv
+```
+
+The usage of `scripts/ann-benchmarks/split_groundtruth.py` is:
+```bash
+usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH
+
+options:
+  -h, --help            show this help message and exit
+  --groundtruth GROUNDTRUTH
+                        Path to billion-scale dataset groundtruth file (default: None)
 ```
 
 ##### Step 1: Prepare Dataset<a id='prep-dataset'></a>

diff --git a/scripts/ann-benchmarks/data_export.py b/scripts/ann-benchmarks/data_export.py
@@ -21,9 +21,9 @@
 def export_results(output_filepath, recompute, groundtruth_filepath,
                    result_filepaths):
     print(f"Writing output file to: {output_filepath}")
-    ann_bench_scripts_dir = "cpp/bench/ann/scripts"
-    ann_bench_scripts_path = os.path.join(os.getcwd(),
-                                          ann_bench_scripts_dir,
+    ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
+                                         "cpp/bench/ann/scripts")
+    ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
                                           "eval.pl")
     if recompute:
         p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath,

diff --git a/scripts/ann-benchmarks/get_dataset.py b/scripts/ann-benchmarks/get_dataset.py
@@ -32,8 +32,9 @@ def download_dataset(url, path):
 
 
 def convert_hdf5_to_fbin(path, normalize):
-    ann_bench_scripts_dir = "cpp/bench/ann/scripts"
-    ann_bench_scripts_path = os.path.join(os.getcwd(), ann_bench_scripts_dir,
+    ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
+                                         "cpp/bench/ann/scripts")
+    ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
                                           "hdf5_to_fbin.py")
     if normalize and "angular" in path:
         p = subprocess.Popen(["python", ann_bench_scripts_path, "-n",

diff --git a/scripts/ann-benchmarks/run.py b/scripts/ann-benchmarks/run.py
@@ -29,7 +29,7 @@ def find_executable(algos_conf, algo):
     executable = algos_conf[algo]["executable"]
     conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann",
                               executable)
-    build_path = os.path.join(os.getcwd(), "cpp", "build", executable)
+    build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
     if os.path.exists(conda_path):
         return (executable, conda_path)
     elif os.path.exists(build_path):
@@ -39,12 +39,11 @@ def find_executable(algos_conf, algo):
 
 
 def run_build_and_search(conf_filename, conf_file, executables_to_run,
-                         force, ann_bench_path, build, search):
+                         force, conf_filedir, build, search):
     for executable, ann_executable_path in executables_to_run.keys():
         # Need to write temporary configuration
         temp_conf_filename = f"temporary_executable_{conf_filename}"
-        temp_conf_filepath = os.path.join(ann_bench_path, "conf",
-                                          temp_conf_filename)
+        temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)
         with open(temp_conf_filepath, "w") as f:
             temp_conf = dict()
             temp_conf["dataset"] = conf_file["dataset"]
@@ -126,6 +125,7 @@ def main():
     # Read configuration file associated to dataset
     conf_filepath = args.configuration
     conf_filename = conf_filepath.split("/")[-1]
+    conf_filedir = "/".join(conf_filepath.split("/")[:-1])
     if not os.path.exists(conf_filepath):
         raise FileNotFoundError(conf_filename)
 
@@ -178,7 +178,7 @@ def main():
                 executables_to_run[executable_path]["index"].append(index)
 
     run_build_and_search(conf_filename, conf_file, executables_to_run,
-                         args.force, ann_bench_path, build, search)
+                         args.force, conf_filedir, build, search)
 
 
 if __name__ == "__main__":

diff --git a/scripts/ann-benchmarks/split_groundtruth.py b/scripts/ann-benchmarks/split_groundtruth.py
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import subprocess
+
+
+def split_groundtruth(groundtruth_filepath):
+    ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
+                                         "cpp/bench/ann/scripts")
+    ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
+                                          "split_groundtruth.pl")
+    pwd = os.getcwd()
+    os.chdir("/".join(groundtruth_filepath.split("/")[:-1]))
+    groundtruth_filename = groundtruth_filepath.split("/")[-1]
+    p = subprocess.Popen([ann_bench_scripts_path, groundtruth_filename, 
+                          "groundtruth"])
+    p.wait()
+    os.chdir(pwd)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--groundtruth",
+                        help="Path to billion-scale dataset groundtruth file",
+                        required=True)
+    args = parser.parse_args()
+
+    split_groundtruth(args.groundtruth)
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()