From 291788697af6fbe0eabbd7136ee1554809f7072a Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 11 Aug 2023 17:35:28 -0700 Subject: [PATCH 1/5] try to run gbench executable --- bench/ann/algos.yaml | 16 +-- bench/ann/run.py | 173 ++++++++++++++----------- cpp/bench/ann/CMakeLists.txt | 19 ++- cpp/bench/ann/src/common/benchmark.cpp | 15 ++- 4 files changed, 126 insertions(+), 97 deletions(-) diff --git a/bench/ann/algos.yaml b/bench/ann/algos.yaml index 5f554fc46b..46d3c9e801 100644 --- a/bench/ann/algos.yaml +++ b/bench/ann/algos.yaml @@ -1,30 +1,18 @@ faiss_gpu_ivf_flat: - executable: FAISS_IVF_FLAT_ANN_BENCH - disabled: false -faiss_gpu_flat: - executable: FAISS_IVF_FLAT_ANN_BENCH disabled: false faiss_gpu_ivf_pq: - executable: FAISS_IVF_PQ_ANN_BENCH disabled: false faiss_gpu_ivf_sq: - executable: FAISS_IVF_PQ_ANN_BENCH disabled: false -faiss_gpu_bfknn: - executable: FAISS_BFKNN_ANN_BENCH +faiss_gpu_flat: disabled: false raft_ivf_flat: - executable: RAFT_IVF_FLAT_ANN_BENCH disabled: false raft_ivf_pq: - executable: RAFT_IVF_PQ_ANN_BENCH disabled: false raft_cagra: - executable: RAFT_CAGRA_ANN_BENCH disabled: false ggnn: - executable: GGNN_ANN_BENCH disabled: false hnswlib: - executable: HNSWLIB_ANN_BENCH - disabled: false \ No newline at end of file + disabled: false diff --git a/bench/ann/run.py b/bench/ann/run.py index ebaef1e004..90175f7433 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -25,54 +25,50 @@ def validate_algorithm(algos_conf, algo): return algo in algos_conf_keys and not algos_conf[algo]["disabled"] -def find_executable(algos_conf, algo): - executable = algos_conf[algo]["executable"] +def find_executable(): + executable = "ANN_BENCH" conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann", executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path) + return conda_path elif os.path.exists(build_path): - return (executable, build_path) + return build_path else: raise FileNotFoundError(executable) -def run_build_and_search(conf_filename, conf_file, executables_to_run, +def run_build_and_search(conf_filename, conf_file, dataset_path, force, conf_filedir, build, search): - for executable, ann_executable_path in executables_to_run.keys(): - # Need to write temporary configuration - temp_conf_filename = f"temporary_executable_{conf_filename}" - temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) - with open(temp_conf_filepath, "w") as f: - temp_conf = dict() - temp_conf["dataset"] = conf_file["dataset"] - temp_conf["search_basic_param"] = conf_file["search_basic_param"] - temp_conf["index"] = executables_to_run[(executable, - ann_executable_path)]["index"] - json.dump(temp_conf, f) - - if build: - if force: - p = subprocess.Popen([ann_executable_path, "-b", "-f", - temp_conf_filepath]) - p.wait() - else: - p = subprocess.Popen([ann_executable_path, "-b", - temp_conf_filepath]) - p.wait() - - if search: - if force: - p = subprocess.Popen([ann_executable_path, "-s", "-f", - temp_conf_filepath]) - p.wait() - else: - p = subprocess.Popen([ann_executable_path, "-s", - temp_conf_filepath]) - p.wait() - - os.remove(temp_conf_filepath) + ann_executable_path = find_executable() + + # Need to write temporary configuration + temp_conf_filename = f"temporary_{conf_filename}" + temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) + with open(temp_conf_filepath, "w") as f: + json.dump(conf_file, f) + + data_prefix = 
"/".join(dataset_path.split("/")[:-1]) + if build: + cmd = [ann_executable_path, "--build", "--data_prefix="+data_prefix] + if force: + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + print(cmd) + p = subprocess.Popen(cmd) + p.wait() + + if search: + cmd = [ann_executable_path, "--search", "--benchmark_out_format=csv", + "--benchmark_out=" + os.path.join(dataset_path, "result.csv"), + "--data_prefix=" + data_prefix] + if force: + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + p = subprocess.Popen(cmd) + p.wait() + + os.remove(temp_conf_filepath) def main(): @@ -90,7 +86,6 @@ def main(): parser.add_argument( "--dataset", help="dataset whose configuration file will be used", - default="glove-100-inner" ) parser.add_argument( "--dataset-path", @@ -118,6 +113,12 @@ def main(): help="re-run algorithms even if their results \ already exist", action="store_true") + parser.add_argument("--batch-size", + help="batch size for querying", + default=1) + parser.add_argument("--k", + help="k neighbors", + default=10) args = parser.parse_args() @@ -133,75 +134,93 @@ def main(): # Read configuration file associated to dataset if args.configuration: conf_filepath = args.configuration + elif args.dataset: + conf_filepath = \ + os.path.join(scripts_path, "conf", f"{args.dataset}.json") else: - conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json") + raise ValueError("One of parameters `configuration` or \ + `dataset` need to be provided") conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) - dataset_name = conf_filename.replace(".json", "") - dataset_path = os.path.join(args.dataset_path, dataset_name) + dataset = conf_filename.replace(".json", "") + dataset_path = os.path.join(args.dataset_path, dataset) if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) + if not os.path.exists(dataset_path): + raise FileNotFoundError(dataset_path) with open(conf_filepath, "r") as f: conf_file = json.load(f) - # Replace base, query to dataset-path - conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") - conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") - # Ensure base and query files exist for dataset - if not os.path.exists(conf_file["dataset"]["base_file"]): - raise FileNotFoundError(conf_file["dataset"]["base_file"]) - if not os.path.exists(conf_file["dataset"]["query_file"]): - raise FileNotFoundError(conf_file["dataset"]["query_file"]) - - executables_to_run = dict() + # # Replace base, query, gr to dataset-path + # conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") + # conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") + # conf_file["dataset"]["groundtruth_neighbors_file"] = \ + # os.path.join(dataset_path, "groundtruth.neighbors.ibin") + # # Ensure base and query files exist for dataset + # if not os.path.exists(conf_file["dataset"]["base_file"]): + # raise FileNotFoundError(conf_file["dataset"]["base_file"]) + # if not os.path.exists(conf_file["dataset"]["query_file"]): + # raise FileNotFoundError(conf_file["dataset"]["query_file"]) + # if not os.path.exists(conf_file["dataset"]["groundtruth_neighbors_file"]): + # raise FileNotFoundError(conf_file["dataset"]["groundtruth_neighbors_file"]) + + # executables_to_run = dict() + indices_to_run = [] # At least one named index should exist in config file if args.indices: indices = set(args.indices.split(",")) # algo associated with index should 
still be present in algos.yaml # and enabled - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if index["name"] in indices and \ validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) # switch to named algorithms if indices parameter is not supplied elif args.algorithms: algorithms = set(args.algorithms.split(",")) # pick out algorithms from conf file that exist # and are enabled in algos.yaml - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if curr_algo in algorithms and \ validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) # default, try to run all available algorithms else: - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) - - # Replace build, search to dataset path - for executable_path in executables_to_run: - for pos, index in enumerate(executables_to_run[executable_path]["index"]): - index["file"] = os.path.join(dataset_path, "index", index["name"]) - index["search_result_file"] = \ - os.path.join(dataset_path, "result", index["name"]) - executables_to_run[executable_path]["index"][pos] = index - - run_build_and_search(conf_filename, conf_file, executables_to_run, + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) + + # filter available indices + if len(indices_to_run) == 0: + raise ValueError("No indices found to run") + conf_file["index"] = [conf_file["index"][i] for i in indices_to_run] + + # Replace index build to dataset path + for pos, index in enumerate(conf_file["index"]): + index["file"] = os.path.join(dataset_path, "index", index["name"]) + conf_file["index"][pos] = index + + print(conf_file) + + run_build_and_search(conf_filename, conf_file, dataset_path, args.force, conf_filedir, build, search) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 2ccdead89a..6df4df082f 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,9 +15,10 @@ # ################################################################################################## # * 
compiler function ----------------------------------------------------------------------------- -option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_SQ "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) @@ -183,18 +184,26 @@ endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) ConfigureAnnBench( - NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) ConfigureAnnBench( - NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) - ConfigureAnnBench(NAME FAISS_BFKNN PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_SQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) endif() if(RAFT_ANN_BENCH_USE_GGNN) diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp index c73f2ed22a..cfffc36515 100644 --- a/cpp/bench/ann/src/common/benchmark.cpp +++ b/cpp/bench/ann/src/common/benchmark.cpp @@ -51,7 +51,20 @@ auto load_lib(const std::string& algo) -> void* if (found != libs.end()) { return found->second.handle; } auto lib_name = "lib" + algo + "_ann_bench.so"; - return libs.emplace(algo, lib_name).first->second.handle; + std::string lib_path = ""; + if (std::getenv("CONDA_PREFIX") != nullptr) { + auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/"; + if (std::filesystem::exists(conda_path + "ANN_BENCH")) { + lib_path = conda_path; + } + } + if (std::getenv("RAFT_HOME") != nullptr) { + auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/"; + if (std::filesystem::exists(build_path + "ANN_BENCH")) { + lib_path = build_path; + } + } + return libs.emplace(algo, lib_path + lib_name).first->second.handle; } auto get_fun_name(void* addr) -> std::string From f927f6927bbc1a1288617df5f00850e1d4c32e89 Mon Sep 17 00:00:00 2001 From: divyegala Date: Thu, 24 Aug 2023 14:18:45 -0700 Subject: [PATCH 2/5] compiling, index building successful, search failing --- bench/ann/algos.yaml | 3 +++ bench/ann/run.py | 22 ++++++++++++++-------- cpp/bench/ann/CMakeLists.txt | 19 +++++-------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/bench/ann/algos.yaml b/bench/ann/algos.yaml index 16a6c1a895..5f554fc46b 100644 --- a/bench/ann/algos.yaml +++ b/bench/ann/algos.yaml @@ -10,6 +10,9 @@ faiss_gpu_ivf_pq: faiss_gpu_ivf_sq: executable: FAISS_IVF_PQ_ANN_BENCH disabled: false +faiss_gpu_bfknn: + executable: FAISS_BFKNN_ANN_BENCH + 
disabled: false raft_ivf_flat: executable: RAFT_IVF_FLAT_ANN_BENCH disabled: false diff --git a/bench/ann/run.py b/bench/ann/run.py index dad1ad7a62..e64148abd8 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -75,19 +75,19 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, p.wait() if search: - # legacy_result_folder = "result/" + conf_file["dataset"]["name"] - # os.makedirs(legacy_result_folder, exist_ok=True) + legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') + os.makedirs(legacy_result_folder, exist_ok=True) cmd = [ann_executable_path, "--search", "--data_prefix="+dataset_path, "--benchmark_counters_tabular", - "--benchmark_out_format=json", + "--benchmark_out_format=csv", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, - "--benchmark_out_format=csv", - f"--benchmark_out={os.path.join(dataset_path, 'result.csv')}"] + f"--benchmark_out={os.path.join(dataset_path, conf_file['dataset']['name'], 'result', f'{executable}.csv')}"] if force: cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] print(cmd) p = subprocess.Popen(cmd) p.wait() @@ -171,11 +171,11 @@ def main(): conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) dataset_name = conf_filename.replace(".json", "") - dataset_path = os.path.realpath(os.path.join(args.dataset_path, dataset_name)) + dataset_path = args.dataset_path if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) - if not os.path.exists(dataset_path): - raise FileNotFoundError(dataset_path) + if not os.path.exists(os.path.join(args.dataset_path, dataset_name)): + raise FileNotFoundError(os.path.join(args.dataset_path, dataset_name)) with open(conf_filepath, "r") as f: conf_file = json.load(f) @@ -219,6 +219,12 @@ def main(): executables_to_run[executable_path] = {"index": []} executables_to_run[executable_path]["index"].append(index) + # Replace index to dataset path + for executable_path in executables_to_run: + for pos, index in enumerate(executables_to_run[executable_path]["index"]): + index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"]) + executables_to_run[executable_path]["index"][pos] = index + print(executables_to_run) run_build_and_search(conf_file, conf_filename, conf_filedir, diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index ed067de064..119a5c0a73 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,10 +15,9 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_SQ "Include faiss' brute-force knn algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) @@ -193,26 +192,18 @@ endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) 
ConfigureAnnBench( - NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) ConfigureAnnBench( - NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) - ConfigureAnnBench( - NAME FAISS_GPU_IVF_SQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss - ) -endif() - -if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) - ConfigureAnnBench( - NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss - ) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) + ConfigureAnnBench(NAME FAISS_BFKNN PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) endif() if(RAFT_ANN_BENCH_USE_GGNN) From 82f195ec3c4112a5738a153d9d06724cee090426 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 25 Aug 2023 11:18:45 -0700 Subject: [PATCH 3/5] write build,search results --- bench/ann/run.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/bench/ann/run.py b/bench/ann/run.py index e64148abd8..8da3eadc3b 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -41,9 +41,9 @@ def find_executable(algos_conf, algo): executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path) + return (executable, conda_path, algo) elif os.path.exists(build_path): - return (executable, build_path) + return (executable, build_path, algo) else: raise FileNotFoundError(executable) @@ -51,7 +51,7 @@ def find_executable(algos_conf, algo): def run_build_and_search(conf_file, conf_filename, conf_filedir, executables_to_run, dataset_path, force, build, search, k, batch_size): - for executable, ann_executable_path in executables_to_run.keys(): + for executable, ann_executable_path, algo in executables_to_run.keys(): # Need to write temporary configuration temp_conf_filename = f"temporary_{conf_filename}" temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) @@ -60,13 +60,20 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, temp_conf["dataset"] = conf_file["dataset"] temp_conf["search_basic_param"] = conf_file["search_basic_param"] temp_conf["index"] = executables_to_run[(executable, - ann_executable_path)]["index"] + ann_executable_path, + algo)]["index"] json.dump(temp_conf, f) + legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') + os.makedirs(legacy_result_folder, exist_ok=True) if build: + build_folder = os.path.join(legacy_result_folder, "build") + os.makedirs(build_folder, exist_ok=True) cmd = [ann_executable_path, "--build", - "--data_prefix="+dataset_path] + "--data_prefix="+dataset_path, + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] @@ -75,16 +82,16 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, p.wait() if search: - legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') - os.makedirs(legacy_result_folder, exist_ok=True) + search_folder = os.path.join(legacy_result_folder, "search") + os.makedirs(search_folder, exist_ok=True) cmd = [ann_executable_path, "--search", "--data_prefix="+dataset_path, "--benchmark_counters_tabular", 
- "--benchmark_out_format=csv", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, - f"--benchmark_out={os.path.join(dataset_path, conf_file['dataset']['name'], 'result', f'{executable}.csv')}"] + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] From 74c9a1bc4704f25dfcc0a2c8901b813c75da7883 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 25 Aug 2023 18:44:04 -0700 Subject: [PATCH 4/5] remove data_export, use gbench csvs to plot --- bench/ann/data_export.py | 80 ------------------------------ bench/ann/plot.py | 44 ++++++++++------ bench/ann/run.py | 3 +- docs/source/raft_ann_benchmarks.md | 68 +++++++++---------------- 4 files changed, 53 insertions(+), 142 deletions(-) delete mode 100644 bench/ann/data_export.py diff --git a/bench/ann/data_export.py b/bench/ann/data_export.py deleted file mode 100644 index 87ca330ed9..0000000000 --- a/bench/ann/data_export.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import subprocess -import json - -from pathlib import Path - -def parse_filepaths(fs): - for p in fs: - if p.endswith(".json") and os.path.exists(p): - yield p - else: - for f in Path(p).rglob('*.json'): - yield f.as_posix() - -def export_results(output_filepath, recompute, groundtruth_filepath, - result_filepath): - print(f"Writing output file to: {output_filepath}") - - parsed_filepaths = parse_filepaths(result_filepaths) - - with open(output_filepath, 'w') as out: - out.write("Algo,Recall,QPS\n") - - for fp in parsed_filepaths: - with open(fp, 'r') as f: - data = json.load(f) - for benchmark_case in data["benchmarks"]: - algo = benchmark_case["name"] - recall = benchmark_case["Recall"] - qps = benchmark_case["items_per_second"] - out.write(f"{algo},{recall},{qps}\n") - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--output", help="Path to the CSV output file", - required=True) - parser.add_argument("--recompute", action="store_true", - help="Recompute metrics") - parser.add_argument("--dataset", - help="Name of the dataset to export results for", - default="glove-100-inner") - parser.add_argument( - "--dataset-path", - help="path to dataset folder", - default=os.path.join(os.getenv("RAFT_HOME"), - "bench", "ann", "data") - ) - - args, result_filepaths = parser.parse_known_args() - - # if nothing is provided - if len(result_filepaths) == 0: - raise ValueError("No filepaths to results were provided") - - groundtruth_filepath = os.path.join(args.dataset_path, args.dataset, - "groundtruth.neighbors.ibin") - export_results(args.output, args.recompute, groundtruth_filepath, - result_filepath) - - -if __name__ == "__main__": - main() diff --git a/bench/ann/plot.py b/bench/ann/plot.py index 0020e398a9..33a1872fe0 100644 --- a/bench/ann/plot.py +++ b/bench/ann/plot.py 
@@ -192,25 +192,38 @@ def inv_fun(x): plt.close() -def load_all_results(result_filepath): +def load_all_results(dataset_path): results = dict() - with open(result_filepath, 'r') as f: - for line in f.readlines()[1:]: - split_lines = line.split(',') - algo_name = split_lines[0].split('.')[0] - if algo_name not in results: - results[algo_name] = [] - results[algo_name].append([algo_name, float(split_lines[1]), - float(split_lines[2])]) + results_path = os.path.join(dataset_path, "result", "search") + for result_filepath in os.listdir(results_path): + with open(os.path.join(results_path, result_filepath), 'r') as f: + lines = f.readlines() + idx = 0 + for pos, line in enumerate(lines): + if "QPS" in line: + idx = pos + break + + for line in lines[idx+1:]: + split_lines = line.split(',') + algo_name = split_lines[0].split('.')[0].strip("\"") + if algo_name not in results: + results[algo_name] = [] + results[algo_name].append([algo_name, float(split_lines[12]), + float(split_lines[10])]) return results def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--result-csv", help="Path to CSV Results", required=True) - parser.add_argument("--output", help="Path to the PNG output file", - default=f"{os.getcwd()}/out.png") + parser.add_argument("--dataset", help="dataset to download", + default="glove-100-inner") + parser.add_argument("--dataset-path", help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data")) + parser.add_argument("--output-filename", + default="plot.png") parser.add_argument( "--x-scale", help="Scale to use when drawing the X-axis. \ @@ -228,12 +241,13 @@ def main(): ) args = parser.parse_args() - print(f"writing output to {args.output}") + output_filepath = os.path.join(args.dataset_path, args.dataset, args.output_filename) + print(f"writing output to {output_filepath}") - results = load_all_results(args.result_csv) + results = load_all_results(os.path.join(args.dataset_path, args.dataset)) linestyles = create_linestyles(sorted(results.keys())) - create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles) + create_plot(results, args.raw, args.x_scale, args.y_scale, output_filepath, linestyles) if __name__ == "__main__": diff --git a/bench/ann/run.py b/bench/ann/run.py index 8da3eadc3b..5c927d5066 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -124,6 +124,7 @@ def main(): parser.add_argument( "--dataset", help="dataset whose configuration file will be used", + default="glove-100-inner" ) parser.add_argument( "--dataset-path", @@ -232,8 +233,6 @@ def main(): index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"]) executables_to_run[executable_path]["index"][pos] = index - print(executables_to_run) - run_build_and_search(conf_file, conf_filename, conf_filedir, executables_to_run, dataset_path, args.force, build, search, diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 67e04ff518..78ff2d96a1 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -21,11 +21,10 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks ### Usage -There are 4 general steps to running the benchmarks and vizualizing the results: +There are 3 general steps to running the benchmarks and vizualizing the results: 1. Prepare Dataset 2. Build Index and Search Index -3. Evaluate Results -4. Plot Results +3. 
Plot Results

We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
@@ -47,11 +46,8 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize
# (2) build and search index
python bench/ann/run.py --dataset deep-image-96-inner

-# (3) evaluate results
-python bench/ann/data_export.py --output out.csv --dataset deep-image-96-inner
-
-# (4) plot results
-python bench/ann/plot.py --result-csv out.csv
+# (3) plot results
+python bench/ann/plot.py --dataset deep-image-96-inner
```

Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `bench/ann/conf`.
@@ -86,11 +82,8 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_
# (2) build and search index
python bench/ann/run.py --dataset deep-1B

-# (3) evaluate results
-python bench/ann/data_export.py --output out.csv --dataset deep-1B
-
-# (4) plot results
-python bench/ann/plot.py --result-csv out.csv
+# (3) plot results
+python bench/ann/plot.py --dataset deep-1B
```

The usage of `bench/ann/split-groundtruth.py` is:
@@ -119,6 +112,7 @@ options:
                        path to download dataset (default: ${RAFT_HOME}/bench/ann/data)
  --normalize           normalize cosine distance to inner product (default: False)
```
+
When option `normalize` is provided to the script, any dataset that has cosine distances
will be normalized to inner product. So, for example, the dataset `glove-100-angular`
will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`.
@@ -140,13 +134,15 @@ available in `raft/cpp/build/`.

The usage of the script `bench/ann/run.py` is:
```bash
-usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
-
-options:
-usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
+usage: run.py [-h] [-k COUNT] [-bs BATCH_SIZE] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES]
+              [-f]

options:
  -h, --help            show this help message and exit
+  -k COUNT, --count COUNT
+                        the number of nearest neighbors to search for (default: 10)
+  -bs BATCH_SIZE, --batch-size BATCH_SIZE
+                        number of query vectors to use in each query trial (default: 10000)
  --configuration CONFIGURATION
                        path to configuration file for a dataset (default: None)
  --dataset DATASET     dataset whose configuration file will be used (default: glove-100-inner)
@@ -157,14 +153,15 @@ options:
  --algorithms ALGORITHMS
                        run only comma separated list of named algorithms (default: None)
  --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
-  -k, --count           number of nearest neighbors to return
-  --batch-size          number of query vectors to pass into search
  -f, --force           re-run algorithms even if their results already exist (default: False)
```
+
`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.
The configuration file should be named `<dataset>.json`.
It is optional if the name of the dataset is provided with the `dataset` argument, in which case
-a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`
+a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
+For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/build/`
+and an index search statistics CSV file in `<dataset-path>/<dataset>/search/`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -177,45 +174,26 @@ it is assumed both are `True`.
`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index
is available in `algos.yaml` and not disabled, as well as having an associated executable.

-#### Step 3: Evaluating Results
-The script `bench/ann/data_export.py` will evaluate results for a dataset whose index has been built
-and searched with at least one algorithm. For every result file that is available to the script, the output
-will be combined and written to a CSV file.
+#### Step 3: Plot Results
+The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
+CSV file in `<dataset-path>/<dataset>/search/`.

The usage of this script is:
```bash
-usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH]
+usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filename OUTPUT_FILENAME] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]

options:
  -h, --help            show this help message and exit
-  --output OUTPUT       Path to the CSV output file (default: None)
-  --recompute           Recompute metrics (default: False)
-  --dataset DATASET     Name of the dataset to export results for (default: glove-100-inner)
+  --dataset DATASET     dataset to download (default: glove-100-inner)
  --dataset-path DATASET_PATH
                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
-```
-
-#### Step 4: Plot Results
-The script `bench/ann/plot.py` will plot all results evaluated to a CSV file for a given dataset.
-
-The usage of this script is:
-```bash
-usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
-
-options:
-  -h, --help            show this help message and exit
-  --result-csv RESULT_CSV
-                        Path to CSV Results (default: None)
-  --output OUTPUT       Path to the PNG output file (default: ${RAFT_HOME}/out.png)
+  --output-filename OUTPUT_FILENAME
  --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
  --y-scale {linear,log,symlog,logit}
                        Scale to use when drawing the Y-axis (default: linear)
  --raw                 Show raw results (not just Pareto frontier) in faded colours (default: False)
```

-All algorithms present in the CSV file supplied to this script with parameter `result_csv`
-will appear in the plot.
-
The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall.
![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png)

From 902f9f48b34e397c6846d05b0f52016932e4537f Mon Sep 17 00:00:00 2001
From: divyegala
Date: Fri, 25 Aug 2023 18:51:34 -0700
Subject: [PATCH 5/5] fix typo in docs path for results

---
 docs/source/raft_ann_benchmarks.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 78ff2d96a1..757e9a59b5 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -160,8 +160,8 @@ options:
The configuration file should be named `<dataset>.json`.
It is optional if the name of the dataset is provided with the `dataset` argument, in which case
a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
-For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/build/`
-and an index search statistics CSV file in `<dataset-path>/<dataset>/search/`.
+For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/result/build/`
+and an index search statistics CSV file in `<dataset-path>/<dataset>/result/search/`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -176,7 +176,7 @@ is available in `algos.yaml` and not disabled, as well as having an associated e

#### Step 3: Plot Results
The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
-CSV file in `<dataset-path>/<dataset>/search/`.
+CSV file in `<dataset-path>/<dataset>/result/search/`.
The usage of this script is:
```bash
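
---

Taken together, the series replaces the per-algorithm `*_ANN_BENCH` executables with a single gbench-driven `ANN_BENCH` binary that `bench/ann/run.py` locates and invokes with command-line flags, writing per-algorithm CSV files that `plot.py` later reads. The following is a minimal standalone sketch of that invocation logic, using only the flags that appear in the diffs above (`--build`/`--search`, `--data_prefix`, `--override_kv`, `--benchmark_out_format`, `--benchmark_out`, `--overwrite`); the helper names `find_ann_bench` and `run_search` are illustrative, not part of the patch:

```python
import os
import subprocess


def find_ann_bench():
    # Mirrors find_executable() in bench/ann/run.py: prefer the conda
    # install location, then fall back to the source build tree.
    for base in (os.path.join(os.getenv("CONDA_PREFIX", ""), "bin", "ann"),
                 os.path.join(os.getenv("RAFT_HOME", ""), "cpp", "build")):
        path = os.path.join(base, "ANN_BENCH")
        if os.path.exists(path):
            return path
    raise FileNotFoundError("ANN_BENCH")


def run_search(conf_path, data_prefix, out_csv, k=10, batch_size=1, force=False):
    # Assemble a search invocation the way run.py does; k and n_queries are
    # injected per run through gbench's --override_kv mechanism.
    cmd = [find_ann_bench(), "--search",
           "--data_prefix=" + data_prefix,
           "--benchmark_counters_tabular",
           "--override_kv=k:%s" % k,
           "--override_kv=n_queries:%s" % batch_size,
           "--benchmark_out_format=csv",
           "--benchmark_out=" + out_csv]
    if force:
        cmd.append("--overwrite")
    # Last argument is the (temporary) JSON configuration written by run.py.
    cmd.append(conf_path)
    subprocess.run(cmd, check=True)
```

The CSV named by `--benchmark_out` is what `load_all_results` in `plot.py` then scans, skipping the gbench preamble until the header row containing `QPS` before reading the recall and throughput columns.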