Skip to content

Commit

Permalink
Improvements to ANN Benchmark Python scripts and docs (#1734)
Browse files Browse the repository at this point in the history
Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ray Douglass (https://github.com/raydouglass)

URL: #1734
  • Loading branch information
divyegala authored Aug 11, 2023
1 parent b3bb21a commit 25b6916
Show file tree
Hide file tree
Showing 14 changed files with 92 additions and 36 deletions.
2 changes: 1 addition & 1 deletion scripts/ann-benchmarks/algos.yaml → bench/ann/algos.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
faise_gpu_ivf_flat:
faiss_gpu_ivf_flat:
executable: FAISS_IVF_FLAT_ANN_BENCH
disabled: false
faiss_gpu_flat:
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
16 changes: 12 additions & 4 deletions scripts/ann-benchmarks/data_export.py → bench/ann/data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,24 @@ def main():
required=True)
parser.add_argument("--recompute", action="store_true",
help="Recompute metrics")
parser.add_argument("--groundtruth",
help="Path to groundtruth.neighbors.ibin file for a dataset",
required=True)
parser.add_argument("--dataset",
help="Name of the dataset to export results for",
default="glove-100-inner")
parser.add_argument(
"--dataset-path",
help="path to dataset folder",
default=os.path.join(os.getenv("RAFT_HOME"),
"bench", "ann", "data")
)

args, result_filepaths = parser.parse_known_args()

# if nothing is provided
if len(result_filepaths) == 0:
raise ValueError("No filepaths to results were provided")

groundtruth_filepath = args.groundtruth
groundtruth_filepath = os.path.join(args.dataset_path, args.dataset,
"groundtruth.neighbors.ibin")
export_results(args.output, args.recompute, groundtruth_filepath,
result_filepaths)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,17 @@ def download(name, normalize, ann_bench_data_path):
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--name", help="dataset to download",
parser.add_argument("--dataset", help="dataset to download",
default="glove-100-angular")
parser.add_argument("--path", help="path to download dataset",
default=os.path.join(os.getcwd(), "data"))
parser.add_argument("--dataset-path", help="path to download dataset",
default=os.path.join(os.getenv("RAFT_HOME"),
"bench", "ann", "data"))
parser.add_argument("--normalize",
help="normalize cosine distance to inner product",
action="store_true")
args = parser.parse_args()

download(args.name, args.normalize, args.path)
download(args.dataset, args.normalize, args.dataset_path)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion scripts/ann-benchmarks/plot.py → bench/ann/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def load_all_results(result_filepath):
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--result_csv", help="Path to CSV Results", required=True)
parser.add_argument("--result-csv", help="Path to CSV Results", required=True)
parser.add_argument("--output", help="Path to the PNG output file",
default=f"{os.getcwd()}/out.png")
parser.add_argument(
Expand Down
30 changes: 28 additions & 2 deletions scripts/ann-benchmarks/run.py → bench/ann/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,17 @@ def main():
parser.add_argument(
"--configuration",
help="path to configuration file for a dataset",
required=True
)
parser.add_argument(
"--dataset",
help="dataset whose configuration file will be used",
default="glove-100-inner"
)
parser.add_argument(
"--dataset-path",
help="path to dataset folder",
default=os.path.join(os.getenv("RAFT_HOME"),
"bench", "ann", "data")
)
parser.add_argument(
"--build",
Expand Down Expand Up @@ -121,15 +131,23 @@ def main():
search = args.search

# Read configuration file associated to dataset
conf_filepath = args.configuration
if args.configuration:
conf_filepath = args.configuration
else:
conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json")
conf_filename = conf_filepath.split("/")[-1]
conf_filedir = "/".join(conf_filepath.split("/")[:-1])
dataset_name = conf_filename.replace(".json", "")
dataset_path = os.path.join(args.dataset_path, dataset_name)
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)

with open(conf_filepath, "r") as f:
conf_file = json.load(f)

# Replace base, query to dataset-path
conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin")
conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin")
# Ensure base and query files exist for dataset
if not os.path.exists(conf_file["dataset"]["base_file"]):
raise FileNotFoundError(conf_file["dataset"]["base_file"])
Expand Down Expand Up @@ -175,6 +193,14 @@ def main():
executables_to_run[executable_path] = {"index": []}
executables_to_run[executable_path]["index"].append(index)

# Replace build, search to dataset path
for executable_path in executables_to_run:
for pos, index in enumerate(executables_to_run[executable_path]["index"]):
index["file"] = os.path.join(dataset_path, "index", index["name"])
index["search_result_file"] = \
os.path.join(dataset_path, "result", index["name"])
executables_to_run[executable_path]["index"][pos] = index

run_build_and_search(conf_filename, conf_file, executables_to_run,
args.force, conf_filedir, build, search)

Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- pyyaml
- scikit-build>=0.13.1
- sysroot_linux-64==2.17
name: bench_ann_cuda-118_arch-x86_64
1 change: 1 addition & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ dependencies:
- libfaiss>=1.7.1
- faiss-proc=*=cuda
- matplotlib
- pyyaml

cudatoolkit:
specific:
Expand Down
67 changes: 43 additions & 24 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ The easiest way to install these benchmarks is through conda. We suggest using m
mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
conda activate raft_ann_benchmarks

mamba install -c rapidsai libraft-ann-bench
mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-ann-bench cudatoolkit=11.8*

git clone https://github.com/rapidsai/raft.git && cd raft
export RAFT_HOME=$(pwd)
```
The channel `rapidsai` can easily be substituted with `rapidsai-nightly` if nightly benchmarks are desired.

Expand All @@ -35,16 +38,16 @@ export RAFT_HOME=$(pwd)
# All scripts are present in directory raft/scripts/ann-benchmarks

# (1) prepare dataset
python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize
python scripts/ann-benchmarks/get_dataset.py --dataset glove-100-angular --normalize

# (2) build and search index
python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json
python scripts/ann-benchmarks/run.py --configuration bench/ann/conf/glove-100-inner.json

# (3) evaluate results
python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/
python scripts/ann-benchmarks/data_export.py --output out.csv --dataset glove-100-inner result/glove-100-inner/

# (4) plot results
python scripts/ann-benchmarks/plot.py --result_csv out.csv
python scripts/ann-benchmarks/plot.py --result-csv out.csv
```

### End-to-end example: Billion-scale
Expand All @@ -62,17 +65,17 @@ mkdir -p data/deep-1B
# (1) prepare dataset
# download manually "Ground Truth" file of "Yandex DEEP"
# suppose the file name is deep_new_groundtruth.public.10K.bin
python scripts/ann-benchmarks/split_groundtruth.py data/deep-1B/deep_new_groundtruth.public.10K.bin
python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced

# (2) build and search index
python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json
python scripts/ann-benchmarks/run.py --configuration bench/ann/conf/deep-1B.json

# (3) evaluate results
python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
python scripts/ann-benchmarks/data_export.py --output out.csv --dataset deep-1B result/deep-1B/

# (4) plot results
python scripts/ann-benchmarks/plot.py --result_csv out.csv
python scripts/ann-benchmarks/plot.py --result-csv out.csv
```

The usage of `scripts/ann-benchmarks/split_groundtruth.py` is:
Expand All @@ -92,18 +95,18 @@ script. For more information on [datasets and formats](ann_benchmarks_dataset.md

The usage of this script is:
```bash
usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize]
usage: get_dataset.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--normalize]

options:
-h, --help show this help message and exit
--name NAME dataset to download (default: glove-100-angular)
--path PATH path to download dataset (default: {os.getcwd()}/data)
--normalize normalize cosine distance to inner product (default: False)
```
-h, --help show this help message and exit
--dataset DATASET dataset to download (default: glove-100-angular)
--dataset-path DATASET_PATH
path to download dataset (default: ${RAFT_HOME}/bench/ann/data)
--normalize normalize cosine distance to inner product (default: False)

When option `normalize` is provided to the script, any dataset that has cosine distances
will be normalized to inner product. So, for example, the dataset `glove-100-angular`
will be written at location `data/glove-100-inner/`.
will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`.

#### Step 2: Build and Search Index
The script `scripts/ann-benchmarks/run.py` will build and search indices for a given dataset and its
Expand All @@ -122,19 +125,34 @@ available in `raft/cpp/build/`.

The usage of the script `scripts/ann-benchmarks/run.py` is:
```bash
usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force]
usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
options:
usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
options:
-h, --help show this help message and exit
--configuration CONFIGURATION
path to configuration file for a dataset (default: None)
--dataset DATASET dataset whose configuration file will be used (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
--build
--search
--algorithms ALGORITHMS
run only comma separated list of named algorithms (default: None)
--indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
--force re-run algorithms even if their results already exist (default: False)
-f, --force re-run algorithms even if their results already exist (default: False)
```
`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.
The configuration file should be named `<dataset>.json`. It is optional if the name of the dataset is
provided with the `dataset` argument, in which case
a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`
`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
2. indices are built in `<dataset-path>/<dataset>/index`
3. search results are stored in `<dataset-path>/<dataset>/result`
`build` and `search` : if neither parameter is supplied to the script then
it is assumed both are `True`.
Expand All @@ -149,17 +167,18 @@ will be combined and written to a CSV file.
The usage of this script is:
```bash
usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH <result_filepaths>
usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH] <result-filepaths>
options:
-h, --help show this help message and exit
--output OUTPUT Path to the CSV output file (default: None)
--recompute Recompute metrics (default: False)
--groundtruth GROUNDTRUTH
Path to groundtruth.neighbors.ibin file for a dataset (default: None)
--dataset DATASET Name of the dataset to export results for (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
```
`result_filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)
`result-filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)
#### Step 4: Plot Results
The script `scripts/ann-benchmarks/plot.py` will plot all results evaluated to a CSV file for a given dataset.
Expand All @@ -170,9 +189,9 @@ usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE
options:
-h, --help show this help message and exit
--result_csv RESULT_CSV
--result-csv RESULT_CSV
Path to CSV Results (default: None)
--output OUTPUT Path to the PNG output file (default: /home/nfs/dgala/raft/out.png)
--output OUTPUT Path to the PNG output file (default: ${RAFT_HOME}/out.png)
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
Expand Down

0 comments on commit 25b6916

Please sign in to comment.