From acd40f57f2dfbe659d0a876455333fb7e8bceb11 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 26 Jul 2023 17:59:40 -0400 Subject: [PATCH] ANN benchmarks python wrapper for splitting billion-scale dataset groundtruth (#1679) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1679 --- docs/source/raft_ann_benchmarks.md | 27 +++++++++--- scripts/ann-benchmarks/data_export.py | 6 +-- scripts/ann-benchmarks/get_dataset.py | 5 ++- scripts/ann-benchmarks/run.py | 10 ++--- scripts/ann-benchmarks/split_groundtruth.py | 47 +++++++++++++++++++++ 5 files changed, 79 insertions(+), 16 deletions(-) create mode 100644 scripts/ann-benchmarks/split_groundtruth.py diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index df0c21dd7b..91958c0bcd 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -27,9 +27,11 @@ There are 4 general steps to running the benchmarks and vizualizing the results: We provide a collection of lightweight Python scripts that are wrappers over lower level scripts and executables to run our benchmarks. Either Python scripts or [low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks, -however plots are only provided through our Python scripts. +however plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is +expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned. ### End-to-end example: Million-scale ```bash +export RAFT_HOME=$(pwd) # All scripts are present in directory raft/scripts/ann-benchmarks # (1) prepare dataset @@ -53,21 +55,34 @@ billion-scale dataset has been downloaded. 
To download Billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html) ```bash -mkdir -p data/deep-1B && cd data/deep-1B +export RAFT_HOME=$(pwd) +# All scripts are present in directory raft/scripts/ann-benchmarks + +mkdir -p data/deep-1B # (1) prepare dataset # download manually "Ground Truth" file of "Yandex DEEP" # suppose the file name is deep_new_groundtruth.public.10K.bin -../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth +python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced # (2) build and search index -python scripts/run.py --configuration conf/deep-1B.json +python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json # (3) evaluate results -python scripts/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/ +python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/ # (4) plot results -python scripts/plot.py --result_csv out.csv +python scripts/ann-benchmarks/plot.py --result_csv out.csv +``` + +The usage of `scripts/ann-benchmarks/split_groundtruth.py` is: +```bash +usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH + +options: + -h, --help show this help message and exit + --groundtruth GROUNDTRUTH + Path to billion-scale dataset groundtruth file (default: None) ``` ##### Step 1: Prepare Dataset diff --git a/scripts/ann-benchmarks/data_export.py b/scripts/ann-benchmarks/data_export.py index 18c6a1a4d2..5be73bef11 100644 --- a/scripts/ann-benchmarks/data_export.py +++ b/scripts/ann-benchmarks/data_export.py @@ -21,9 +21,9 @@ def export_results(output_filepath, recompute, groundtruth_filepath, result_filepaths): print(f"Writing output file to: {output_filepath}") - ann_bench_scripts_dir = "cpp/bench/ann/scripts" 
- ann_bench_scripts_path = os.path.join(os.getcwd(), - ann_bench_scripts_dir, + ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"), + "cpp/bench/ann/scripts") + ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir, "eval.pl") if recompute: p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath, diff --git a/scripts/ann-benchmarks/get_dataset.py b/scripts/ann-benchmarks/get_dataset.py index c071296125..5c21a5e2e1 100644 --- a/scripts/ann-benchmarks/get_dataset.py +++ b/scripts/ann-benchmarks/get_dataset.py @@ -32,8 +32,9 @@ def download_dataset(url, path): def convert_hdf5_to_fbin(path, normalize): - ann_bench_scripts_dir = "cpp/bench/ann/scripts" - ann_bench_scripts_path = os.path.join(os.getcwd(), ann_bench_scripts_dir, + ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"), + "cpp/bench/ann/scripts") + ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir, "hdf5_to_fbin.py") if normalize and "angular" in path: p = subprocess.Popen(["python", ann_bench_scripts_path, "-n", diff --git a/scripts/ann-benchmarks/run.py b/scripts/ann-benchmarks/run.py index 5a2be49af6..e2236dce81 100644 --- a/scripts/ann-benchmarks/run.py +++ b/scripts/ann-benchmarks/run.py @@ -29,7 +29,7 @@ def find_executable(algos_conf, algo): executable = algos_conf[algo]["executable"] conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann", executable) - build_path = os.path.join(os.getcwd(), "cpp", "build", executable) + build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): return (executable, conda_path) elif os.path.exists(build_path): @@ -39,12 +39,11 @@ def find_executable(algos_conf, algo): def run_build_and_search(conf_filename, conf_file, executables_to_run, - force, ann_bench_path, build, search): + force, conf_filedir, build, search): for executable, ann_executable_path in executables_to_run.keys(): # Need to write temporary configuration temp_conf_filename = 
f"temporary_executable_{conf_filename}" - temp_conf_filepath = os.path.join(ann_bench_path, "conf", - temp_conf_filename) + temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) with open(temp_conf_filepath, "w") as f: temp_conf = dict() temp_conf["dataset"] = conf_file["dataset"] @@ -126,6 +125,7 @@ def main(): # Read configuration file associated to dataset conf_filepath = args.configuration conf_filename = conf_filepath.split("/")[-1] + conf_filedir = "/".join(conf_filepath.split("/")[:-1]) if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) @@ -178,7 +178,7 @@ def main(): executables_to_run[executable_path]["index"].append(index) run_build_and_search(conf_filename, conf_file, executables_to_run, - args.force, ann_bench_path, build, search) + args.force, conf_filedir, build, search) if __name__ == "__main__": diff --git a/scripts/ann-benchmarks/split_groundtruth.py b/scripts/ann-benchmarks/split_groundtruth.py new file mode 100644 index 0000000000..cd67d9c8b8 --- /dev/null +++ b/scripts/ann-benchmarks/split_groundtruth.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import argparse
import os
import subprocess


def split_groundtruth(groundtruth_filepath):
    """Split a billion-scale groundtruth file with ``split_groundtruth.pl``.

    The helper script ``cpp/bench/ann/scripts/split_groundtruth.pl`` is
    located relative to the ``RAFT_HOME`` environment variable and executed
    from the directory containing ``groundtruth_filepath``, receiving the
    file name and the literal output prefix ``"groundtruth"`` as arguments.
    Any output the script writes relative to its working directory therefore
    lands next to the input file.

    Parameters
    ----------
    groundtruth_filepath : str
        Path to the groundtruth file. May be absolute, relative, or a bare
        file name in the current working directory.

    Raises
    ------
    RuntimeError
        If the ``RAFT_HOME`` environment variable is unset or empty.
    subprocess.CalledProcessError
        If the helper script exits with a non-zero status.
    """
    raft_home = os.getenv("RAFT_HOME")
    if not raft_home:
        raise RuntimeError(
            "Environment variable RAFT_HOME must point to the directory "
            "where RAFT is cloned")

    # Make the script path absolute: the child runs in a different working
    # directory, so a relative RAFT_HOME would otherwise resolve incorrectly.
    script_path = os.path.abspath(
        os.path.join(raft_home, "cpp/bench/ann/scripts",
                     "split_groundtruth.pl"))

    directory, filename = os.path.split(groundtruth_filepath)

    # Run the child in the file's directory via ``cwd`` instead of changing
    # this process's working directory; an empty directory component (bare
    # file name) means "stay in the current directory".
    subprocess.run([script_path, filename, "groundtruth"],
                   cwd=directory or None, check=True)


def main():
    """Parse the command line and split the requested groundtruth file."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--groundtruth",
                        help="Path to billion-scale dataset groundtruth file",
                        required=True)
    args = parser.parse_args()

    split_groundtruth(args.groundtruth)


if __name__ == "__main__":
    main()