diff --git a/scripts/ann-benchmarks/algos.yaml b/bench/ann/algos.yaml
similarity index 97%
rename from scripts/ann-benchmarks/algos.yaml
rename to bench/ann/algos.yaml
index 54fddf607b..5f554fc46b 100644
--- a/scripts/ann-benchmarks/algos.yaml
+++ b/bench/ann/algos.yaml
@@ -1,4 +1,4 @@
-faise_gpu_ivf_flat:
+faiss_gpu_ivf_flat:
   executable: FAISS_IVF_FLAT_ANN_BENCH
   disabled: false
 faiss_gpu_flat:
diff --git a/cpp/bench/ann/conf/bigann-100M.json b/bench/ann/conf/bigann-100M.json
similarity index 100%
rename from cpp/bench/ann/conf/bigann-100M.json
rename to bench/ann/conf/bigann-100M.json
diff --git a/cpp/bench/ann/conf/deep-100M.json b/bench/ann/conf/deep-100M.json
similarity index 100%
rename from cpp/bench/ann/conf/deep-100M.json
rename to bench/ann/conf/deep-100M.json
diff --git a/cpp/bench/ann/conf/deep-1B.json b/bench/ann/conf/deep-1B.json
similarity index 100%
rename from cpp/bench/ann/conf/deep-1B.json
rename to bench/ann/conf/deep-1B.json
diff --git a/cpp/bench/ann/conf/glove-100-inner.json b/bench/ann/conf/glove-100-inner.json
similarity index 100%
rename from cpp/bench/ann/conf/glove-100-inner.json
rename to bench/ann/conf/glove-100-inner.json
diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json b/bench/ann/conf/sift-128-euclidean.json
similarity index 100%
rename from cpp/bench/ann/conf/sift-128-euclidean.json
rename to bench/ann/conf/sift-128-euclidean.json
diff --git a/scripts/ann-benchmarks/data_export.py b/bench/ann/data_export.py
similarity index 80%
rename from scripts/ann-benchmarks/data_export.py
rename to bench/ann/data_export.py
index 5be73bef11..df48882840 100644
--- a/scripts/ann-benchmarks/data_export.py
+++ b/bench/ann/data_export.py
@@ -41,16 +41,24 @@ def main():
                         required=True)
     parser.add_argument("--recompute", action="store_true",
                         help="Recompute metrics")
-    parser.add_argument("--groundtruth",
-                        help="Path to groundtruth.neighbors.ibin file for a dataset",
-                        required=True)
+    parser.add_argument("--dataset",
+                        help="Name of the dataset to export results for",
+                        default="glove-100-inner")
+    parser.add_argument(
+        "--dataset-path",
+        help="path to dataset folder",
+        default=os.path.join(os.getenv("RAFT_HOME"),
+                             "bench", "ann", "data")
+    )
+
     args, result_filepaths = parser.parse_known_args()

     # if nothing is provided
     if len(result_filepaths) == 0:
         raise ValueError("No filepaths to results were provided")

-    groundtruth_filepath = args.groundtruth
+    groundtruth_filepath = os.path.join(args.dataset_path, args.dataset,
+                                        "groundtruth.neighbors.ibin")
     export_results(args.output, args.recompute, groundtruth_filepath,
                    result_filepaths)
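As a reading aid alongside the data_export.py hunk above: with `--groundtruth` gone, the groundtruth file is now derived from `--dataset` and `--dataset-path`. A minimal standalone sketch of that resolution, assuming `RAFT_HOME` is exported; the helper name is illustrative and not part of the patch:

```python
# Sketch only: mirrors how data_export.py now builds the groundtruth path from
# --dataset/--dataset-path; groundtruth_file() is a hypothetical helper.
import os

def groundtruth_file(dataset="glove-100-inner", dataset_path=None):
    if dataset_path is None:
        dataset_path = os.path.join(os.getenv("RAFT_HOME"), "bench", "ann", "data")
    return os.path.join(dataset_path, dataset, "groundtruth.neighbors.ibin")

# -> ${RAFT_HOME}/bench/ann/data/glove-100-inner/groundtruth.neighbors.ibin
print(groundtruth_file())
```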
inner product", action="store_true") args = parser.parse_args() - download(args.name, args.normalize, args.path) + download(args.dataset, args.normalize, args.dataset_path) if __name__ == "__main__": diff --git a/scripts/ann-benchmarks/plot.py b/bench/ann/plot.py similarity index 99% rename from scripts/ann-benchmarks/plot.py rename to bench/ann/plot.py index 772bdf8738..0020e398a9 100644 --- a/scripts/ann-benchmarks/plot.py +++ b/bench/ann/plot.py @@ -208,7 +208,7 @@ def load_all_results(result_filepath): def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--result_csv", help="Path to CSV Results", required=True) + parser.add_argument("--result-csv", help="Path to CSV Results", required=True) parser.add_argument("--output", help="Path to the PNG output file", default=f"{os.getcwd()}/out.png") parser.add_argument( diff --git a/scripts/ann-benchmarks/run.py b/bench/ann/run.py similarity index 84% rename from scripts/ann-benchmarks/run.py rename to bench/ann/run.py index e906b26e23..ebaef1e004 100644 --- a/scripts/ann-benchmarks/run.py +++ b/bench/ann/run.py @@ -86,7 +86,17 @@ def main(): parser.add_argument( "--configuration", help="path to configuration file for a dataset", - required=True + ) + parser.add_argument( + "--dataset", + help="dataset whose configuration file will be used", + default="glove-100-inner" + ) + parser.add_argument( + "--dataset-path", + help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data") ) parser.add_argument( "--build", @@ -121,15 +131,23 @@ def main(): search = args.search # Read configuration file associated to dataset - conf_filepath = args.configuration + if args.configuration: + conf_filepath = args.configuration + else: + conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json") conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) + dataset_name = conf_filename.replace(".json", "") + dataset_path = os.path.join(args.dataset_path, dataset_name) if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) with open(conf_filepath, "r") as f: conf_file = json.load(f) + # Replace base, query to dataset-path + conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") + conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") # Ensure base and query files exist for dataset if not os.path.exists(conf_file["dataset"]["base_file"]): raise FileNotFoundError(conf_file["dataset"]["base_file"]) @@ -175,6 +193,14 @@ def main(): executables_to_run[executable_path] = {"index": []} executables_to_run[executable_path]["index"].append(index) + # Replace build, search to dataset path + for executable_path in executables_to_run: + for pos, index in enumerate(executables_to_run[executable_path]["index"]): + index["file"] = os.path.join(dataset_path, "index", index["name"]) + index["search_result_file"] = \ + os.path.join(dataset_path, "result", index["name"]) + executables_to_run[executable_path]["index"][pos] = index + run_build_and_search(conf_filename, conf_file, executables_to_run, args.force, conf_filedir, build, search) diff --git a/scripts/ann-benchmarks/split_groundtruth.py b/bench/ann/split_groundtruth.py similarity index 100% rename from scripts/ann-benchmarks/split_groundtruth.py rename to bench/ann/split_groundtruth.py diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml 
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 278d8c4d5a..d62404b16f 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -34,6 +34,7 @@ dependencies:
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
+- pyyaml
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 583ff29201..ee04c886d7 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -170,6 +170,7 @@ dependencies:
       - libfaiss>=1.7.1
       - faiss-proc=*=cuda
       - matplotlib
+      - pyyaml
   cudatoolkit:
     specific:
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 6fd7523c64..88dac9fabd 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -9,7 +9,10 @@ The easiest way to install these benchmarks is through conda. We suggest using m
 mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
 conda activate raft_ann_benchmarks
-mamba install -c rapidsai libraft-ann-bench
+mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-ann-bench cudatoolkit=11.8*
+
+git clone https://github.com/rapidsai/raft.git && cd raft
+export RAFT_HOME=$(pwd)
 ```

 The channel `rapidsai` can easily be substituted `rapidsai-nightly` if nightly benchmarks are desired.
@@ -35,16 +38,16 @@ export RAFT_HOME=$(pwd)
 # All scripts are present in directory raft/scripts/ann-benchmarks

 # (1) prepare dataset
-python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize
+python scripts/ann-benchmarks/get_dataset.py --dataset glove-100-angular --normalize

 # (2) build and search index
-python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json
+python scripts/ann-benchmarks/run.py --configuration bench/ann/conf/glove-100-inner.json

 # (3) evaluate results
-python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/
+python scripts/ann-benchmarks/data_export.py --output out.csv --dataset glove-100-inner result/glove-100-inner/

 # (4) plot results
-python scripts/ann-benchmarks/plot.py --result_csv out.csv
+python scripts/ann-benchmarks/plot.py --result-csv out.csv
 ```

 ### End-to-end example: Billion-scale
@@ -62,17 +65,17 @@ mkdir -p data/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-python scripts/ann-benchmarks/split_groundtruth.py data/deep-1B/deep_new_groundtruth.public.10K.bin
+python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced

 # (2) build and search index
-python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json
+python scripts/ann-benchmarks/run.py --configuration bench/ann/conf/deep-1B.json

 # (3) evaluate results
-python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/
+python scripts/ann-benchmarks/data_export.py --output out.csv --dataset deep-1B result/deep-1B/

 # (4) plot results
-python scripts/ann-benchmarks/plot.py --result_csv out.csv
+python scripts/ann-benchmarks/plot.py --result-csv out.csv
 ```

 The usage of `scripts/ann-benchmarks/split-groundtruth.py` is:
@@ -92,18 +95,18 @@ script. For more information on [datasets and formats](ann_benchmarks_dataset.md
 The usage of this script is:
 ```bash
-usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize]
+usage: get_dataset.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--normalize]

 options:
-  -h, --help   show this help message and exit
-  --name NAME  dataset to download (default: glove-100-angular)
-  --path PATH  path to download dataset (default: {os.getcwd()}/data)
-  --normalize  normalize cosine distance to inner product (default: False)
-```
+  -h, --help            show this help message and exit
+  --dataset DATASET     dataset to download (default: glove-100-angular)
+  --dataset-path DATASET_PATH
+                        path to download dataset (default: ${RAFT_HOME}/bench/ann/data)
+  --normalize           normalize cosine distance to inner product (default: False)
+```

 When option `normalize` is provided to the script, any dataset that has cosine distances
 will be normalized to inner product. So, for example, the dataset `glove-100-angular`
-will be written at location `data/glove-100-inner/`.
+will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`.

 #### Step 2: Build and Search Index
 The script `scripts/ann-benchmarks/run.py` will build and search indices for a given dataset and its
@@ -122,19 +125,34 @@ available in `raft/cpp/build/`.
 The usage of the script `scripts/run.py` is:
 ```bash
-usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force]
+usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]

 options:
   -h, --help            show this help message and exit
   --configuration CONFIGURATION
                         path to configuration file for a dataset (default: None)
+  --dataset DATASET     dataset whose configuration file will be used (default: glove-100-inner)
+  --dataset-path DATASET_PATH
+                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
   --build
   --search
   --algorithms ALGORITHMS
                         run only comma separated list of named algorithms (default: None)
   --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
-  --force               re-run algorithms even if their results already exist (default: False)
+  -f, --force           re-run algorithms even if their results already exist (default: False)
 ```

+`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.
+The configuration file should be named as `<dataset>.json`. It is optional if the name of the dataset is
+provided with the `dataset` argument, in which case
+a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`
+
+`dataset-path` :
+1. data is read from `<dataset-path>/<dataset>`
+2. indices are built in `<dataset-path>/<dataset>/index`
+3. search results are stored in `<dataset-path>/<dataset>/result`
 `build` and `search` : if both parameters are not supplied to the script then it is
 assumed both are `True`.
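The numbered list above fixes the on-disk layout relative to `--dataset-path`. Purely as an illustration (the variable names and printout are not from the repository), the locations it implies for the default dataset are:

```python
# Illustration of the <dataset-path>/<dataset> layout described above.
import os

dataset = "glove-100-inner"
dataset_path = os.path.join(os.getenv("RAFT_HOME", "."), "bench", "ann", "data")
root = os.path.join(dataset_path, dataset)

print(os.path.join(root, "base.fbin"))    # base vectors read by run.py
print(os.path.join(root, "query.fbin"))   # query vectors read by run.py
print(os.path.join(root, "index"))        # indices are built here
print(os.path.join(root, "result"))       # search results are written here
print(os.path.join(root, "groundtruth.neighbors.ibin"))  # read by data_export.py
```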
@@ -149,17 +167,18 @@ will be combined and written to a CSV file.
 The usage of this script is:
 ```bash
-usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH
+usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH]

 options:
   -h, --help            show this help message and exit
   --output OUTPUT       Path to the CSV output file (default: None)
   --recompute           Recompute metrics (default: False)
-  --groundtruth GROUNDTRUTH
-                        Path to groundtruth.neighbors.ibin file for a dataset (default: None)
+  --dataset DATASET     Name of the dataset to export results for (default: glove-100-inner)
+  --dataset-path DATASET_PATH
+                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
 ```

-`result_filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)
+`result-filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more [information and examples](ann_benchmarks_low_level.html#result-filepath-example)

 #### Step 4: Plot Results
 The script `scripts/ann-benchmarks/plot.py` will plot all results evaluated to a CSV file for a given dataset.
@@ -170,9 +189,9 @@ usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE
 options:
   -h, --help            show this help message and exit
-  --result_csv RESULT_CSV
+  --result-csv RESULT_CSV
                         Path to CSV Results (default: None)
-  --output OUTPUT       Path to the PNG output file (default: /home/nfs/dgala/raft/out.png)
+  --output OUTPUT       Path to the PNG output file (default: ${RAFT_HOME}/out.png)
   --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
   --y-scale {linear,log,symlog,logit}
                         Scale to use when drawing the Y-axis (default: linear)
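One note on the flag renames above (`--result_csv` to `--result-csv`, `--name` to `--dataset`, and so on): argparse turns dashes in long option names into underscores for the parsed attribute, so code such as plot.py can keep reading `args.result_csv` after the rename. A minimal standalone demonstration:

```python
# Standalone demo of argparse's dash-to-underscore conversion; not benchmark code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--result-csv", help="Path to CSV Results", required=True)
args = parser.parse_args(["--result-csv", "out.csv"])
print(args.result_csv)  # -> out.csv
```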