Skip to content

Commit

Permalink
ANN benchmarks python wrapper for splitting billion-scale dataset gro…
Browse files Browse the repository at this point in the history
…undtruth (#1679)

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #1679
  • Loading branch information
divyegala authored Jul 26, 2023
1 parent 617d33a commit acd40f5
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 16 deletions.
27 changes: 21 additions & 6 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ There are 4 general steps to running the benchmarks and vizualizing the results:
We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
[low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks,
however plots are only provided through our Python scripts.
however plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is
expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned.
### End-to-end example: Million-scale
```bash
export RAFT_HOME=$(pwd)
# All scripts are present in directory raft/scripts/ann-benchmarks

# (1) prepare dataset
Expand All @@ -53,21 +55,34 @@ billion-scale dataset has been downloaded.
To download Billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html)

```bash
mkdir -p data/deep-1B && cd data/deep-1B
export RAFT_HOME=$(pwd)
# All scripts are present in directory raft/scripts/ann-benchmarks

mkdir -p data/deep-1B
# (1) prepare dataset
# download manually "Ground Truth" file of "Yandex DEEP"
# suppose the file name is deep_new_groundtruth.public.10K.bin
../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth
python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced

# (2) build and search index
python scripts/run.py --configuration conf/deep-1B.json
python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json

# (3) evaluate results
python scripts/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/

# (4) plot results
python scripts/plot.py --result_csv out.csv
python scripts/ann-benchmarks/plot.py --result_csv out.csv
```

The usage of `scripts/ann-benchmarks/split_groundtruth.py` is:
```bash
usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH

options:
-h, --help show this help message and exit
--groundtruth GROUNDTRUTH
Path to billion-scale dataset groundtruth file (default: None)
```

##### Step 1: Prepare Dataset<a id='prep-dataset'></a>
Expand Down
6 changes: 3 additions & 3 deletions scripts/ann-benchmarks/data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
def export_results(output_filepath, recompute, groundtruth_filepath,
result_filepaths):
print(f"Writing output file to: {output_filepath}")
ann_bench_scripts_dir = "cpp/bench/ann/scripts"
ann_bench_scripts_path = os.path.join(os.getcwd(),
ann_bench_scripts_dir,
ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
"cpp/bench/ann/scripts")
ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
"eval.pl")
if recompute:
p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath,
Expand Down
5 changes: 3 additions & 2 deletions scripts/ann-benchmarks/get_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ def download_dataset(url, path):


def convert_hdf5_to_fbin(path, normalize):
ann_bench_scripts_dir = "cpp/bench/ann/scripts"
ann_bench_scripts_path = os.path.join(os.getcwd(), ann_bench_scripts_dir,
ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
"cpp/bench/ann/scripts")
ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
"hdf5_to_fbin.py")
if normalize and "angular" in path:
p = subprocess.Popen(["python", ann_bench_scripts_path, "-n",
Expand Down
10 changes: 5 additions & 5 deletions scripts/ann-benchmarks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def find_executable(algos_conf, algo):
executable = algos_conf[algo]["executable"]
conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann",
executable)
build_path = os.path.join(os.getcwd(), "cpp", "build", executable)
build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
if os.path.exists(conda_path):
return (executable, conda_path)
elif os.path.exists(build_path):
Expand All @@ -39,12 +39,11 @@ def find_executable(algos_conf, algo):


def run_build_and_search(conf_filename, conf_file, executables_to_run,
force, ann_bench_path, build, search):
force, conf_filedir, build, search):
for executable, ann_executable_path in executables_to_run.keys():
# Need to write temporary configuration
temp_conf_filename = f"temporary_executable_{conf_filename}"
temp_conf_filepath = os.path.join(ann_bench_path, "conf",
temp_conf_filename)
temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)
with open(temp_conf_filepath, "w") as f:
temp_conf = dict()
temp_conf["dataset"] = conf_file["dataset"]
Expand Down Expand Up @@ -126,6 +125,7 @@ def main():
# Read configuration file associated to dataset
conf_filepath = args.configuration
conf_filename = conf_filepath.split("/")[-1]
conf_filedir = "/".join(conf_filepath.split("/")[:-1])
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)

Expand Down Expand Up @@ -178,7 +178,7 @@ def main():
executables_to_run[executable_path]["index"].append(index)

run_build_and_search(conf_filename, conf_file, executables_to_run,
args.force, ann_bench_path, build, search)
args.force, conf_filedir, build, search)


if __name__ == "__main__":
Expand Down
47 changes: 47 additions & 0 deletions scripts/ann-benchmarks/split_groundtruth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess


def split_groundtruth(groundtruth_filepath):
    """Run ``split_groundtruth.pl`` on a billion-scale groundtruth file.

    The Perl script is looked up under
    ``$RAFT_HOME/cpp/bench/ann/scripts`` and is executed with the directory
    that contains ``groundtruth_filepath`` as its working directory. It
    receives the file's base name and the literal output prefix
    ``"groundtruth"`` as arguments, so its outputs land next to the input.

    Args:
        groundtruth_filepath: Path to the groundtruth file. May be absolute,
            relative, or a bare file name in the current directory.

    Raises:
        RuntimeError: If the ``RAFT_HOME`` environment variable is not set.
    """
    raft_home = os.getenv("RAFT_HOME")
    if raft_home is None:
        raise RuntimeError(
            "The RAFT_HOME environment variable must be set to the directory "
            "where RAFT is cloned")

    # Resolve the script to an absolute path up front: the child runs in a
    # different working directory, so a relative RAFT_HOME would otherwise
    # be looked up relative to the groundtruth directory.
    script_path = os.path.abspath(
        os.path.join(raft_home, "cpp/bench/ann/scripts",
                     "split_groundtruth.pl"))

    groundtruth_dir, groundtruth_filename = os.path.split(
        groundtruth_filepath)

    # Set the working directory for the child only, instead of os.chdir()
    # on this process: nothing needs restoring if the launch fails. A bare
    # file name has an empty directory part, which means "stay where we are".
    # The child's exit status is not inspected, as before.
    subprocess.run([script_path, groundtruth_filename, "groundtruth"],
                   cwd=groundtruth_dir or None)


def main():
    """Parse the command line and split the requested groundtruth file.

    Accepts a single required option, ``--groundtruth``, and forwards its
    value to ``split_groundtruth``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        "--groundtruth",
        required=True,
        help="Path to billion-scale dataset groundtruth file",
    )
    options = cli.parse_args()
    split_groundtruth(options.groundtruth)


# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()

0 comments on commit acd40f5

Please sign in to comment.