Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ANN benchmarks python wrapper for splitting billion-scale dataset groundtruth #1679

Merged
merged 5 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ There are 4 general steps to running the benchmarks and visualizing the results:
We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
[low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks,
however plots are only provided through our Python scripts.
however plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is
expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned.
### End-to-end example: Million-scale
```bash
export RAFT_HOME=$(pwd)
# All scripts are present in directory raft/scripts/ann-benchmarks

# (1) prepare dataset
Expand All @@ -53,21 +55,34 @@ billion-scale dataset has been downloaded.
To download Billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html)

```bash
mkdir -p data/deep-1B && cd data/deep-1B
export RAFT_HOME=$(pwd)
# All scripts are present in directory raft/scripts/ann-benchmarks

mkdir -p data/deep-1B
# (1) prepare dataset
# download manually "Ground Truth" file of "Yandex DEEP"
# suppose the file name is deep_new_groundtruth.public.10K.bin
../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth
python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced

# (2) build and search index
python scripts/run.py --configuration conf/deep-1B.json
python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json

# (3) evaluate results
python scripts/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/

# (4) plot results
python scripts/plot.py --result_csv out.csv
python scripts/ann-benchmarks/plot.py --result_csv out.csv
```

The usage of `scripts/ann-benchmarks/split_groundtruth.py` is:
```bash
usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH

options:
-h, --help show this help message and exit
--groundtruth GROUNDTRUTH
Path to billion-scale dataset groundtruth file (default: None)
```

##### Step 1: Prepare Dataset<a id='prep-dataset'></a>
Expand Down
6 changes: 3 additions & 3 deletions scripts/ann-benchmarks/data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
def export_results(output_filepath, recompute, groundtruth_filepath,
result_filepaths):
print(f"Writing output file to: {output_filepath}")
ann_bench_scripts_dir = "cpp/bench/ann/scripts"
ann_bench_scripts_path = os.path.join(os.getcwd(),
ann_bench_scripts_dir,
ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
"cpp/bench/ann/scripts")
ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
"eval.pl")
if recompute:
p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath,
Expand Down
5 changes: 3 additions & 2 deletions scripts/ann-benchmarks/get_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ def download_dataset(url, path):


def convert_hdf5_to_fbin(path, normalize):
ann_bench_scripts_dir = "cpp/bench/ann/scripts"
ann_bench_scripts_path = os.path.join(os.getcwd(), ann_bench_scripts_dir,
ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"),
"cpp/bench/ann/scripts")
ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir,
"hdf5_to_fbin.py")
if normalize and "angular" in path:
p = subprocess.Popen(["python", ann_bench_scripts_path, "-n",
Expand Down
10 changes: 5 additions & 5 deletions scripts/ann-benchmarks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def find_executable(algos_conf, algo):
executable = algos_conf[algo]["executable"]
conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann",
executable)
build_path = os.path.join(os.getcwd(), "cpp", "build", executable)
build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
if os.path.exists(conda_path):
return (executable, conda_path)
elif os.path.exists(build_path):
Expand All @@ -39,12 +39,11 @@ def find_executable(algos_conf, algo):


def run_build_and_search(conf_filename, conf_file, executables_to_run,
force, ann_bench_path, build, search):
force, conf_filedir, build, search):
for executable, ann_executable_path in executables_to_run.keys():
# Need to write temporary configuration
temp_conf_filename = f"temporary_executable_{conf_filename}"
temp_conf_filepath = os.path.join(ann_bench_path, "conf",
temp_conf_filename)
temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)
with open(temp_conf_filepath, "w") as f:
temp_conf = dict()
temp_conf["dataset"] = conf_file["dataset"]
Expand Down Expand Up @@ -126,6 +125,7 @@ def main():
# Read configuration file associated to dataset
conf_filepath = args.configuration
conf_filename = conf_filepath.split("/")[-1]
conf_filedir = "/".join(conf_filepath.split("/")[:-1])
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)

Expand Down Expand Up @@ -178,7 +178,7 @@ def main():
executables_to_run[executable_path]["index"].append(index)

run_build_and_search(conf_filename, conf_file, executables_to_run,
args.force, ann_bench_path, build, search)
args.force, conf_filedir, build, search)


if __name__ == "__main__":
Expand Down
47 changes: 47 additions & 0 deletions scripts/ann-benchmarks/split_groundtruth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess


def split_groundtruth(groundtruth_filepath):
    """Run ``split_groundtruth.pl`` on a billion-scale groundtruth file.

    The Perl script is looked up under
    ``$RAFT_HOME/cpp/bench/ann/scripts`` and is executed with the directory
    containing ``groundtruth_filepath`` as its working directory. It receives
    the groundtruth file name and the output prefix ``"groundtruth"`` as its
    two arguments.

    Parameters
    ----------
    groundtruth_filepath : str
        Path to the groundtruth file. A bare file name (no directory
        component) refers to a file in the current working directory.

    Raises
    ------
    RuntimeError
        If the ``RAFT_HOME`` environment variable is not set.
    subprocess.CalledProcessError
        If the Perl script exits with a non-zero status.
    """
    raft_home = os.getenv("RAFT_HOME")
    if not raft_home:
        raise RuntimeError(
            "The RAFT_HOME environment variable must be set to the "
            "directory where RAFT is cloned")

    # Resolve the script to an absolute path up front: the child runs in a
    # different working directory, so a relative RAFT_HOME would otherwise
    # be resolved against the wrong location.
    script_path = os.path.abspath(
        os.path.join(raft_home, "cpp/bench/ann/scripts",
                     "split_groundtruth.pl"))

    groundtruth_dir, groundtruth_filename = os.path.split(
        groundtruth_filepath)

    # Pass the working directory to the child instead of calling os.chdir():
    # this leaves the caller's cwd untouched even when the script fails, and
    # an empty directory part (bare file name) simply means "stay here".
    subprocess.run([script_path, groundtruth_filename, "groundtruth"],
                   cwd=groundtruth_dir or None,
                   check=True)


def main():
    """Parse the command line and split the given groundtruth file.

    Expects a mandatory ``--groundtruth`` option holding the path to the
    billion-scale dataset groundtruth file, then hands that path to
    ``split_groundtruth``.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg_parser.add_argument(
        "--groundtruth",
        required=True,
        help="Path to billion-scale dataset groundtruth file",
    )
    cli_args = arg_parser.parse_args()

    split_groundtruth(cli_args.groundtruth)


# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()