diff --git a/bench/ann/data_export.py b/bench/ann/data_export.py deleted file mode 100644 index 87ca330ed9..0000000000 --- a/bench/ann/data_export.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import subprocess -import json - -from pathlib import Path - -def parse_filepaths(fs): - for p in fs: - if p.endswith(".json") and os.path.exists(p): - yield p - else: - for f in Path(p).rglob('*.json'): - yield f.as_posix() - -def export_results(output_filepath, recompute, groundtruth_filepath, - result_filepath): - print(f"Writing output file to: {output_filepath}") - - parsed_filepaths = parse_filepaths(result_filepaths) - - with open(output_filepath, 'w') as out: - out.write("Algo,Recall,QPS\n") - - for fp in parsed_filepaths: - with open(fp, 'r') as f: - data = json.load(f) - for benchmark_case in data["benchmarks"]: - algo = benchmark_case["name"] - recall = benchmark_case["Recall"] - qps = benchmark_case["items_per_second"] - out.write(f"{algo},{recall},{qps}\n") - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--output", help="Path to the CSV output file", - required=True) - parser.add_argument("--recompute", action="store_true", - help="Recompute metrics") - parser.add_argument("--dataset", - help="Name of the dataset to export results for", - default="glove-100-inner") - parser.add_argument( - "--dataset-path", - help="path to dataset folder", - default=os.path.join(os.getenv("RAFT_HOME"), - "bench", "ann", "data") - ) - - args, result_filepaths = parser.parse_known_args() - - # if nothing is provided - if len(result_filepaths) == 0: - raise ValueError("No filepaths to results were provided") - - groundtruth_filepath = os.path.join(args.dataset_path, args.dataset, - "groundtruth.neighbors.ibin") - export_results(args.output, args.recompute, groundtruth_filepath, - result_filepath) - - -if __name__ == "__main__": - main() diff --git a/bench/ann/plot.py b/bench/ann/plot.py index 0020e398a9..33a1872fe0 100644 --- a/bench/ann/plot.py +++ b/bench/ann/plot.py @@ -192,25 +192,38 @@ def inv_fun(x): plt.close() -def load_all_results(result_filepath): +def load_all_results(dataset_path): results = dict() - with open(result_filepath, 'r') as f: - for line in f.readlines()[1:]: - split_lines = line.split(',') - algo_name = split_lines[0].split('.')[0] - if algo_name not in results: - results[algo_name] = [] - results[algo_name].append([algo_name, float(split_lines[1]), - float(split_lines[2])]) + results_path = os.path.join(dataset_path, "result", "search") + for result_filepath in os.listdir(results_path): + with open(os.path.join(results_path, result_filepath), 'r') as f: + lines = f.readlines() + idx = 0 + for pos, line in enumerate(lines): + if "QPS" in line: + idx = pos + break + + for line in lines[idx+1:]: + split_lines = line.split(',') + algo_name = split_lines[0].split('.')[0].strip("\"") + if 
algo_name not in results: + results[algo_name] = [] + results[algo_name].append([algo_name, float(split_lines[12]), + float(split_lines[10])]) return results def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--result-csv", help="Path to CSV Results", required=True) - parser.add_argument("--output", help="Path to the PNG output file", - default=f"{os.getcwd()}/out.png") + parser.add_argument("--dataset", help="dataset to download", + default="glove-100-inner") + parser.add_argument("--dataset-path", help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data")) + parser.add_argument("--output-filename", + default="plot.png") parser.add_argument( "--x-scale", help="Scale to use when drawing the X-axis. \ @@ -228,12 +241,13 @@ def main(): ) args = parser.parse_args() - print(f"writing output to {args.output}") + output_filepath = os.path.join(args.dataset_path, args.dataset, args.output_filename) + print(f"writing output to {output_filepath}") - results = load_all_results(args.result_csv) + results = load_all_results(os.path.join(args.dataset_path, args.dataset)) linestyles = create_linestyles(sorted(results.keys())) - create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles) + create_plot(results, args.raw, args.x_scale, args.y_scale, output_filepath, linestyles) if __name__ == "__main__": diff --git a/bench/ann/run.py b/bench/ann/run.py index 60b9a012ad..5c927d5066 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -41,49 +41,62 @@ def find_executable(algos_conf, algo): executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path) + return (executable, conda_path, algo) elif os.path.exists(build_path): - return (executable, build_path) + return (executable, build_path, algo) else: raise FileNotFoundError(executable) -def run_build_and_search(conf_filename, conf_file, executables_to_run, - force, conf_filedir, build, search, k, batch_size): - for executable, ann_executable_path in executables_to_run.keys(): +def run_build_and_search(conf_file, conf_filename, conf_filedir, + executables_to_run, dataset_path, force, + build, search, k, batch_size): + for executable, ann_executable_path, algo in executables_to_run.keys(): # Need to write temporary configuration - temp_conf_filename = f"temporary_executable_{conf_filename}" + temp_conf_filename = f"temporary_{conf_filename}" temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) with open(temp_conf_filepath, "w") as f: temp_conf = dict() temp_conf["dataset"] = conf_file["dataset"] temp_conf["search_basic_param"] = conf_file["search_basic_param"] - temp_conf["index"] = executables_to_run[(executable, - ann_executable_path)]["index"] + temp_conf["index"] = executables_to_run[(executable, + ann_executable_path, + algo)]["index"] json.dump(temp_conf, f) + legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') + os.makedirs(legacy_result_folder, exist_ok=True) if build: + build_folder = os.path.join(legacy_result_folder, "build") + os.makedirs(build_folder, exist_ok=True) + cmd = [ann_executable_path, + "--build", + "--data_prefix="+dataset_path, + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"] if force: - p = subprocess.Popen([ann_executable_path, "--build", "--overwrite", - temp_conf_filepath]) - p.wait() - else: - p 
= subprocess.Popen([ann_executable_path, "--build", - temp_conf_filepath]) - p.wait() + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + print(cmd) + p = subprocess.Popen(cmd) + p.wait() if search: - legacy_result_folder = "result/" + temp_conf["dataset"]["name"] - os.makedirs(legacy_result_folder, exist_ok=True) - p = subprocess.Popen([ - ann_executable_path, - "--search", - "--benchmark_counters_tabular", - "--benchmark_out_format=json", - "--override_kv=k:%s" % k, - "--override_kv=n_queries:%s" % batch_size, - f"--benchmark_out={legacy_result_folder}/{executable}.json", - temp_conf_filepath]) + search_folder = os.path.join(legacy_result_folder, "search") + os.makedirs(search_folder, exist_ok=True) + cmd = [ann_executable_path, + "--search", + "--data_prefix="+dataset_path, + "--benchmark_counters_tabular", + "--override_kv=k:%s" % k, + "--override_kv=n_queries:%s" % batch_size, + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"] + if force: + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + print(cmd) + p = subprocess.Popen(cmd) p.wait() os.remove(temp_conf_filepath) @@ -157,28 +170,24 @@ def main(): # Read configuration file associated to dataset if args.configuration: conf_filepath = args.configuration + elif args.dataset: + conf_filepath = \ + os.path.join(scripts_path, "conf", f"{args.dataset}.json") else: - conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json") + raise ValueError("One of parameters `configuration` or \ + `dataset` need to be provided") conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) dataset_name = conf_filename.replace(".json", "") - dataset_path = os.path.realpath(os.path.join(args.dataset_path, dataset_name)) + dataset_path = args.dataset_path if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) + if not os.path.exists(os.path.join(args.dataset_path, dataset_name)): + raise FileNotFoundError(os.path.join(args.dataset_path, dataset_name)) with open(conf_filepath, "r") as f: conf_file = json.load(f) - # Replace base, query to dataset-path - conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") - conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") - conf_file["dataset"]["groundtruth_neighbors_file"] = os.path.join(dataset_path, "groundtruth.neighbors.ibin") - # Ensure base and query files exist for dataset - if not os.path.exists(conf_file["dataset"]["base_file"]): - raise FileNotFoundError(conf_file["dataset"]["base_file"]) - if not os.path.exists(conf_file["dataset"]["query_file"]): - raise FileNotFoundError(conf_file["dataset"]["query_file"]) - executables_to_run = dict() # At least one named index should exist in config file if args.indices: @@ -218,16 +227,16 @@ def main(): executables_to_run[executable_path] = {"index": []} executables_to_run[executable_path]["index"].append(index) - # Replace build, search to dataset path + # Replace index to dataset path for executable_path in executables_to_run: for pos, index in enumerate(executables_to_run[executable_path]["index"]): - index["file"] = os.path.join(dataset_path, "index", index["name"]) - index["search_result_file"] = \ - os.path.join(dataset_path, "result", index["name"]) + index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"]) executables_to_run[executable_path]["index"][pos] = index - run_build_and_search(conf_filename, conf_file, executables_to_run, - 
args.force, conf_filedir, build, search, k, batch_size) + run_build_and_search(conf_file, conf_filename, conf_filedir, + executables_to_run, dataset_path, + args.force, build, search, + k, batch_size) if __name__ == "__main__": diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp index 46e3bec5bb..555f14f1bb 100644 --- a/cpp/bench/ann/src/common/benchmark.cpp +++ b/cpp/bench/ann/src/common/benchmark.cpp @@ -52,8 +52,17 @@ auto load_lib(const std::string& algo) -> void* auto found = libs.find(algo); if (found != libs.end()) { return found->second.handle; } - auto lib_name = "lib" + algo + "_ann_bench.so"; - return libs.emplace(algo, lib_name).first->second.handle; + auto lib_name = "lib" + algo + "_ann_bench.so"; + std::string lib_path = ""; + if (std::getenv("CONDA_PREFIX") != nullptr) { + auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/"; + if (std::filesystem::exists(conda_path + "ANN_BENCH")) { lib_path = conda_path; } + } + if (std::getenv("RAFT_HOME") != nullptr) { + auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/"; + if (std::filesystem::exists(build_path + "ANN_BENCH")) { lib_path = build_path; } + } + return libs.emplace(algo, lib_path + lib_name).first->second.handle; } auto get_fun_name(void* addr) -> std::string diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 67e04ff518..757e9a59b5 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -21,11 +21,10 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks ### Usage -There are 4 general steps to running the benchmarks and vizualizing the results: +There are 3 general steps to running the benchmarks and vizualizing the results: 1. Prepare Dataset 2. Build Index and Search Index -3. Evaluate Results -4. Plot Results +3. Plot Results We provide a collection of lightweight Python scripts that are wrappers over lower level scripts and executables to run our benchmarks. Either Python scripts or @@ -47,11 +46,8 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize # (2) build and search index python bench/ann/run.py --dataset deep-image-96-inner -# (3) evaluate results -python bench/ann/data_export.py --output out.csv --dataset deep-image-96-inner - -# (4) plot results -python bench/ann/plot.py --result-csv out.csv +# (3) plot results +python bench/ann/plot.py --dataset deep-image-96-inner ``` Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `bench/ann/conf`. 
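For reference, a minimal sketch of what such a configuration file could contain is shown below. It is assembled from the keys this change's `run.py` actually reads (`dataset`, `search_basic_param`, and a list of `index` entries with `name` and `file`); the dataset name and the `algo`/`build_param`/`search_params` fields and values are illustrative assumptions, not a recommended configuration.

```python
# Illustrative only: writes a skeleton bench/ann/conf/<dataset>.json.
# The keys "dataset", "search_basic_param" and "index" (with "name" and "file")
# mirror what run.py reads; the remaining fields and values are assumptions.
import json
import os

conf = {
    "dataset": {
        "name": "my-dataset-96-inner",                 # hypothetical dataset
        "base_file": "my-dataset-96-inner/base.fbin",  # resolved against --dataset-path
        "query_file": "my-dataset-96-inner/query.fbin",
        "groundtruth_neighbors_file": "my-dataset-96-inner/groundtruth.neighbors.ibin",
        "distance": "inner_product",
    },
    "search_basic_param": {"batch_size": 10000, "k": 10},
    "index": [
        {
            "name": "raft_ivf_flat.nlist1024",         # algo must be enabled in algos.yaml
            "algo": "raft_ivf_flat",
            "build_param": {"nlist": 1024},
            "file": "index/raft_ivf_flat.nlist1024",
            "search_params": [{"nprobe": 10}, {"nprobe": 50}],
        }
    ],
}

conf_dir = os.path.join(os.getenv("RAFT_HOME", "."), "bench", "ann", "conf")
with open(os.path.join(conf_dir, "my-dataset-96-inner.json"), "w") as f:
    json.dump(conf, f, indent=2)
```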
@@ -86,11 +82,8 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_ # (2) build and search index python bench/ann/run.py --dataset deep-1B -# (3) evaluate results -python bench/ann/data_export.py --output out.csv --dataset deep-1B - -# (4) plot results -python bench/ann/plot.py --result-csv out.csv +# (3) plot results +python bench/ann/plot.py --dataset deep-1B ``` The usage of `bench/ann/split-groundtruth.py` is: @@ -119,6 +112,7 @@ options: path to download dataset (default: ${RAFT_HOME}/bench/ann/data) --normalize normalize cosine distance to inner product (default: False) ``` + When option `normalize` is provided to the script, any dataset that has cosine distances will be normalized to inner product. So, for example, the dataset `glove-100-angular` will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`. @@ -140,13 +134,15 @@ available in `raft/cpp/build/`. The usage of the script `bench/ann/run.py` is: ```bash -usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f] - -options: -usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f] +usage: run.py [-h] [-k COUNT] [-bs BATCH_SIZE] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] + [-f] options: -h, --help show this help message and exit + -k COUNT, --count COUNT + the number of nearest neighbors to search for (default: 10) + -bs BATCH_SIZE, --batch-size BATCH_SIZE + number of query vectors to use in each query trial (default: 10000) --configuration CONFIGURATION path to configuration file for a dataset (default: None) --dataset DATASET dataset whose configuration file will be used (default: glove-100-inner) @@ -157,14 +153,15 @@ options: --algorithms ALGORITHMS run only comma separated list of named algorithms (default: None) --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) - -k, --count number of nearest neighbors to return - --batch-size number of query vectors to pass into search -f, --force re-run algorithms even if their results already exist (default: False) ``` + `configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset. The configuration file should be name as `.json`. It is optional if the name of the dataset is provided with the `dataset` argument, in which case -a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/.json` +a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/.json`. +For every algorithm run by this script, it outputs an index build statistics CSV file in `/result/build/ +and an index search statistics CSV file in `/result/search/. `dataset-path` : 1. data is read from `/` @@ -177,45 +174,26 @@ it is assumed both are `True`. `indices` and `algorithms` : these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable. -#### Step 3: Evaluating Results -The script `bench/ann/data_export.py` will evaluate results for a dataset whose index has been built -and searched with at least one algorithm. For every result file that is available to the script, the output -will be combined and written to a CSV file. 
+#### Step 3: Plot Results +The script `bench/ann/plot.py` will plot results for all algorithms found in the index search statistics +CSV files in `/result/search/`. The usage of this script is: ```bash -usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH] +usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filename OUTPUT_FILENAME] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] options: -h, --help show this help message and exit - --output OUTPUT Path to the CSV output file (default: None) - --recompute Recompute metrics (default: False) - --dataset DATASET Name of the dataset to export results for (default: glove-100-inner) + --dataset DATASET dataset to download (default: glove-100-inner) --dataset-path DATASET_PATH path to dataset folder (default: ${RAFT_HOME}/bench/ann/data) -``` - -#### Step 4: Plot Results -The script `bench/ann/plot.py` will plot all results evaluated to a CSV file for a given dataset. - -The usage of this script is: -```bash -usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] - -options: - -h, --help show this help message and exit - --result-csv RESULT_CSV - Path to CSV Results (default: None) - --output OUTPUT Path to the PNG output file (default: ${RAFT_HOME}/out.png) + --output-filename OUTPUT_FILENAME --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) --raw Show raw results (not just Pareto frontier) in faded colours (default: False) ``` -All algorithms present in the CSV file supplied to this script with parameter `result_csv` -will appear in the plot. - The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall. ![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png)
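To make the CSV layout concrete, the following is a small, self-contained sketch (not part of the repository) of reading one of the per-algorithm search CSVs written by `run.py` and pulling out the same `Recall` and `QPS` counters that `plot.py` consumes. The counter names and the example file path are assumptions based on the code in this change.

```python
# Sketch: read <dataset-path>/<dataset>/result/search/<algo>.csv and yield
# (index name, recall, QPS), assuming the benchmark counters are named
# "Recall" and "QPS" as referenced by plot.py above.
import csv

def find_col(header, name):
    # First header cell containing `name` (mirrors plot.py's substring check).
    return next(i for i, cell in enumerate(header) if name in cell)

def read_recall_qps(csv_path):
    with open(csv_path, newline="") as f:
        rows = list(csv.reader(f))
    # Google Benchmark prefixes the CSV with context lines; the real header is
    # the first row that mentions the QPS counter.
    header_idx = next(i for i, row in enumerate(rows) if any("QPS" in cell for cell in row))
    header = rows[header_idx]
    name_col = find_col(header, "name")
    recall_col = find_col(header, "Recall")
    qps_col = find_col(header, "QPS")
    for row in rows[header_idx + 1:]:
        if len(row) <= max(recall_col, qps_col):
            continue  # skip trailing or partial lines
        yield row[name_col], float(row[recall_col]), float(row[qps_col])

# Example usage (illustrative path and algorithm name):
for name, recall, qps in read_recall_qps(
        "bench/ann/data/deep-image-96-inner/result/search/raft_ivf_flat.csv"):
    print(f"{name}: recall={recall:.4f}, qps={qps:.1f}")
```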