From 720606bd7574a64577b021de2dfd3245e0bc5cb1 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 29 Aug 2023 11:11:04 -0700 Subject: [PATCH 1/4] revert dlopen path logic --- cpp/bench/ann/src/common/benchmark.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp index 555f14f1bb..46e3bec5bb 100644 --- a/cpp/bench/ann/src/common/benchmark.cpp +++ b/cpp/bench/ann/src/common/benchmark.cpp @@ -52,17 +52,8 @@ auto load_lib(const std::string& algo) -> void* auto found = libs.find(algo); if (found != libs.end()) { return found->second.handle; } - auto lib_name = "lib" + algo + "_ann_bench.so"; - std::string lib_path = ""; - if (std::getenv("CONDA_PREFIX") != nullptr) { - auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/"; - if (std::filesystem::exists(conda_path + "ANN_BENCH")) { lib_path = conda_path; } - } - if (std::getenv("RAFT_HOME") != nullptr) { - auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/"; - if (std::filesystem::exists(build_path + "ANN_BENCH")) { lib_path = build_path; } - } - return libs.emplace(algo, lib_path + lib_name).first->second.handle; + auto lib_name = "lib" + algo + "_ann_bench.so"; + return libs.emplace(algo, lib_name).first->second.handle; } auto get_fun_name(void* addr) -> std::string From b869e9e7f0b75886c0a8cf928a3c2f8c46b7b747 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 29 Aug 2023 11:12:06 -0700 Subject: [PATCH 2/4] add pandas, minor docs update --- .../environments/bench_ann_cuda-118_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + docs/source/raft_ann_benchmarks.md | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 37a4042aac..5eab55be13 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 +- pandas - pyyaml - scikit-build>=0.13.1 - sysroot_linux-64==2.17 diff --git a/dependencies.yaml b/dependencies.yaml index 9a0807143c..05f5e5e2ce 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -171,6 +171,7 @@ dependencies: - benchmark>=1.8.2 - faiss-proc=*=cuda - matplotlib + - pandas - pyyaml cudatoolkit: diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 29187e77e2..8d3b996abb 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -180,7 +180,8 @@ CSV file in `/search/result/. 
The usage of this script is: ```bash -usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] +usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] + [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] options: -h, --help show this help message and exit @@ -189,6 +190,14 @@ options: path to dataset folder (default: ${RAFT_HOME}/bench/ann/data) --output-filepath OUTPUT_FILEPATH directory for PNG to be saved (default: os.getcwd()) + --algorithms ALGORITHMS + plot only comma separated list of named algorithms (default: None) + -k COUNT, --count COUNT + the number of nearest neighbors to search for (default: 10) + -bs BATCH_SIZE, --batch-size BATCH_SIZE + number of query vectors to use in each query trial (default: 10000) + --build + --search --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) From 9643327726349d23795bab932922ca64c24815da Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 29 Aug 2023 12:02:01 -0700 Subject: [PATCH 3/4] add data_export.py to convert json to csv --- bench/ann/data_export.py | 65 ++++++++++++++++++++++++++ bench/ann/plot.py | 74 +++++++++++++----------------- bench/ann/run.py | 12 ++--- docs/source/raft_ann_benchmarks.md | 26 +++++++++-- 4 files changed, 123 insertions(+), 54 deletions(-) create mode 100644 bench/ann/data_export.py diff --git a/bench/ann/data_export.py b/bench/ann/data_export.py new file mode 100644 index 0000000000..8e04277123 --- /dev/null +++ b/bench/ann/data_export.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse +import pandas as pd +import os +import json + + +def read_file(dataset, dataset_path, method): + dir = os.path.join(dataset_path, dataset, "result", method) + for file in os.listdir(dir): + if file.endswith(".json"): + with open(os.path.join(dir, file), "r") as f: + data = json.load(f) + df = pd.DataFrame(data["benchmarks"]) + yield (os.path.join(dir, file), file.split('-')[0], df) + +def convert_json_to_csv_build(dataset, dataset_path): + for file, algo_name, df in read_file(dataset, dataset_path, "build"): + df['name'] = df['name'].str.split('/').str[0] + write = pd.DataFrame({'algo_name' : [algo_name] * len(df), + 'index_name' : df['name'], + 'time' : df['real_time']}) + write.to_csv(file.replace('.json', '.csv'), index=False) + + +def convert_json_to_csv_search(dataset, dataset_path): + for file, algo_name, df in read_file(dataset, dataset_path, "search"): + df['name'] = df['name'].str.split('/').str[0] + write = pd.DataFrame({'algo_name' : [algo_name] * len(df), + 'index_name' : df['name'], + 'recall' : df['Recall'], + 'qps' : df['items_per_second']}) + write.to_csv(file.replace('.json', '.csv'), index=False) + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--dataset", help="dataset to download", + default="glove-100-inner") + parser.add_argument("--dataset-path", help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data")) + args = parser.parse_args() + convert_json_to_csv_build(args.dataset, args.dataset_path) + convert_json_to_csv_search(args.dataset, args.dataset_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bench/ann/plot.py b/bench/ann/plot.py index a731516013..ff7cb29b4a 100644 --- a/bench/ann/plot.py +++ b/bench/ann/plot.py @@ -203,18 +203,18 @@ def create_plot_build(build_results, search_results, linestyles, fn_out, xn = "k-nn" yn = "qps" - recall_85 = [-1] * len(linestyles) qps_85 = [-1] * len(linestyles) bt_85 = [0] * len(linestyles) i_85 = [-1] * len(linestyles) - recall_90 = [-1] * len(linestyles) + qps_90 = [-1] * len(linestyles) bt_90 = [0] * len(linestyles) i_90 = [-1] * len(linestyles) - recall_95 = [-1] * len(linestyles) + qps_95 = [-1] * len(linestyles) bt_95 = [0] * len(linestyles) i_95 = [-1] * len(linestyles) + data = OrderedDict() colors = OrderedDict() @@ -248,7 +248,7 @@ def mean_y(algo): plt.figure(figsize=(12, 9)) ax = df.plot.bar(rot=0, color=colors) fig = ax.get_figure() - print(f"writing search output to {fn_out}") + print(f"writing build output to {fn_out}") plt.title("Build Time for Highest QPS") plt.suptitle(f"{dataset} k={k} batch_size={batch_size}") plt.ylabel("Build Time (s)") @@ -258,45 +258,33 @@ def mean_y(algo): def load_lines(results_path, result_files, method, index_key): results = dict() - linebreaker = "name,iterations" - for result_filename in result_files: - with open(os.path.join(results_path, result_filename), 'r') as f: - lines = f.readlines() - lines = lines[:-1] if lines[-1] == "\n" else lines - idx = 0 - for pos, line in enumerate(lines): - if linebreaker in line: - idx = pos - break - - if method == "build": - if "hnswlib" in result_filename: + if result_filename.endswith('.csv'): + with open(os.path.join(results_path, result_filename), 'r') as f: + lines = f.readlines() + lines = lines[:-1] if lines[-1] == "\n" else lines + + if method == "build": key_idx = [2] - else: - key_idx = [10] - elif method == "search": - if "hnswlib" in 
result_filename: - key_idx = [10, 6] - else: - key_idx = [12, 10] - - for line in lines[idx+1:]: - split_lines = line.split(',') - - algo_name = split_lines[0].split('.')[0].strip("\"") - index_name = split_lines[0].split('/')[0].strip("\"") - - if index_key == "algo": - dict_key = algo_name - elif index_key == "index": - dict_key = (algo_name, index_name) - if dict_key not in results: - results[dict_key] = [] - to_add = [algo_name, index_name] - for key_i in key_idx: - to_add.append(float(split_lines[key_i])) - results[dict_key].append(to_add) + elif method == "search": + key_idx = [2, 3] + + for line in lines[1:]: + split_lines = line.split(',') + + algo_name = split_lines[0] + index_name = split_lines[1] + + if index_key == "algo": + dict_key = algo_name + elif index_key == "index": + dict_key = (algo_name, index_name) + if dict_key not in results: + results[dict_key] = [] + to_add = [algo_name, index_name] + for key_i in key_idx: + to_add.append(float(split_lines[key_i])) + results[dict_key].append(to_add) return results @@ -375,8 +363,8 @@ def main(): build = args.build search = args.search - search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png") - build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png") + search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-k{k}-batch_size{batch_size}.png") + build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-k{k}-batch_size{batch_size}.png") search_results = load_all_results( os.path.join(args.dataset_path, args.dataset), diff --git a/bench/ann/run.py b/bench/ann/run.py index 5cac54506f..2da966cbcd 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -41,9 +41,9 @@ def find_executable(algos_conf, algo, k, batch_size): executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path, f"{algo}-{k}-{batch_size}") + return (executable, conda_path, f"{algo}-k{k}-batch_size{batch_size}") elif os.path.exists(build_path): - return (executable, build_path, f"{algo}-{k}-{batch_size}") + return (executable, build_path, f"{algo}-k{k}-batch_size{batch_size}") else: raise FileNotFoundError(executable) @@ -72,8 +72,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, cmd = [ann_executable_path, "--build", "--data_prefix="+dataset_path, - "--benchmark_out_format=csv", - f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"] + "--benchmark_out_format=json", + f"--benchmark_out={os.path.join(build_folder, f'{algo}.json')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] @@ -90,8 +90,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, "--benchmark_counters_tabular", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, - "--benchmark_out_format=csv", - f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"] + "--benchmark_out_format=json", + f"--benchmark_out={os.path.join(search_folder, f'{algo}.json')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 8d3b996abb..061cdf92ab 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -160,13 +160,13 @@ options: The configuration file should be name as `.json`. 
It is optional if the name of the dataset is provided with the `dataset` argument, in which case a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/.json`. -For every algorithm run by this script, it outputs an index build statistics CSV file in `/result/build/ -and an index search statistics CSV file in `/result/search/. +For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/` +and an index search statistics JSON file in `/result/search/`. `dataset-path` : 1. data is read from `/` 2. indices are built in `//index` -3. search results are stored in `//result` +3. build/search results are stored in `//result` `build` and `search` : if both parameters are not supplied to the script then it is assumed both are `True`. @@ -174,9 +174,25 @@ it is assumed both are `True`. `indices` and `algorithms` : these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable. -#### Step 3: Plot Results +#### Step 3: Data Export +The script `bench/ann/data_export.py` will convert the intermediate JSON outputs produced by `bench/ann/run.py` to more +easily readable CSV files, which are needed to build charts made by `bench/ann/plot.py`. + +```bash +usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] + +options: + -h, --help show this help message and exit + --dataset DATASET dataset to download (default: glove-100-inner) + --dataset-path DATASET_PATH + path to dataset folder (default: ${RAFT_HOME}/bench/ann/data) +``` +Build statistics CSV file is stored in `/result/build/` +and index search statistics CSV file in `/result/search/`. + +#### Step 4: Plot Results The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics -CSV file in `/search/result/. +CSV file in `/result/search/<-k{k}-batch_size{batch_size}>.csv`. The usage of this script is: ```bash From ef7b7f9262819a8044c17ad657597bb868db68e3 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 29 Aug 2023 12:05:22 -0700 Subject: [PATCH 4/4] update docs --- bench/ann/data_export.py | 2 +- docs/source/raft_ann_benchmarks.md | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bench/ann/data_export.py b/bench/ann/data_export.py index 8e04277123..33304bc276 100644 --- a/bench/ann/data_export.py +++ b/bench/ann/data_export.py @@ -62,4 +62,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 061cdf92ab..aae4a07100 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -21,10 +21,11 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks ### Usage -There are 3 general steps to running the benchmarks and vizualizing the results: +There are 4 general steps to running the benchmarks and vizualizing the results: 1. Prepare Dataset 2. Build Index and Search Index -3. Plot Results +3. Data Export +4. Plot Results We provide a collection of lightweight Python scripts that are wrappers over lower level scripts and executables to run our benchmarks. 
Either Python scripts or @@ -46,7 +47,10 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize # (2) build and search index python bench/ann/run.py --dataset deep-image-96-inner -# (3) plot results +# (3) export data +python bench/ann/data_export.py --dataset deep-image-96-inner + +# (4) plot results python bench/ann/plot.py --dataset deep-image-96-inner ``` @@ -82,7 +86,10 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_ # (2) build and search index python bench/ann/run.py --dataset deep-1B -# (3) plot results +# (3) export data +python bench/ann/data_export.py --dataset deep-1B + +# (4) plot results python bench/ann/plot.py --dataset deep-1B ```
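
As a quick illustration of how the CSV files written by `bench/ann/data_export.py` can be consumed outside of `bench/ann/plot.py`, the sketch below loads the exported search results for a dataset and reports the fastest configuration per algorithm above a recall threshold. The column names (`algo_name`, `index_name`, `recall`, `qps`) and the `<dataset-path>/<dataset>/result/search/` layout follow `convert_json_to_csv_search` in this patch; the 0.95 recall cutoff and the glob pattern are illustrative assumptions, not part of the scripts above.

```python
# Minimal sketch: summarize the search-result CSVs produced by data_export.py.
# Assumes the default dataset layout used by the benchmark scripts; the 0.95
# recall threshold is only an example.
import glob
import os

import pandas as pd

dataset_path = os.path.join(os.getenv("RAFT_HOME", "."), "bench", "ann", "data")
dataset = "glove-100-inner"
search_dir = os.path.join(dataset_path, dataset, "result", "search")

# Each algorithm run leaves one CSV with columns: algo_name, index_name, recall, qps.
frames = [pd.read_csv(f) for f in glob.glob(os.path.join(search_dir, "*.csv"))]
if frames:
    df = pd.concat(frames, ignore_index=True)
    # Keep only runs reaching at least 95% recall, then take the highest-QPS
    # configuration per algorithm.
    best = (df[df["recall"] >= 0.95]
            .sort_values("qps", ascending=False)
            .groupby("algo_name", as_index=False)
            .first())
    print(best[["algo_name", "index_name", "recall", "qps"]])
```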