From f70a258b433f7e6a4d745ffd343fdbb102a3f226 Mon Sep 17 00:00:00 2001
From: Divye Gala
Date: Thu, 21 Dec 2023 10:22:51 -0500
Subject: [PATCH] Update `raft-ann-bench` output filenames and add features to plotting (#2043)

This PR:
1. Adds more clarity to filenames by using `,` as the separator instead of `_`
2. Adds 80% and 99% recall bars to build plots
3. Does not plot a recall level in the build plot if no data is present
4. Adds an `x-start` argument which allows controlling the minimum recall level used on the x-axis of the search plot
5. Fixes a multi-line issue that sometimes occurs in search plots
6. Build time plots now plot average build times for the indices corresponding to a search query in each recall range

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2043
---
 docs/source/raft_ann_benchmarks.md         |  13 +-
 .../raft-ann-bench/data_export/__main__.py |  34 +++--
 .../src/raft-ann-bench/plot/__main__.py    | 131 ++++++++++++------
 .../src/raft-ann-bench/run/__main__.py     |  33 +++--
 4 files changed, 136 insertions(+), 75 deletions(-)

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index dcdfc2cec9..a2fe820317 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -171,8 +171,8 @@ options:
 `algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `raft_cagra.large`
-For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/`
-and an index search statistics JSON file in `/result/search/`. NOTE: The filenams will not have "_{group}" if `group = "base"`.
+For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/<{algo},{group}.json>`
+and an index search statistics JSON file in `/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
 `dataset-path` :
 1. data is read from `/`
@@ -198,8 +198,8 @@ options:
   --dataset-path DATASET_PATH
                         path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
 ```
-Build statistics CSV file is stored in `/result/build/`
-and index search statistics CSV file in `/result/search/`, where suffix has three values:
+Build statistics CSV file is stored in `/result/build/<{algo},{group}.csv>`
+and index search statistics CSV file in `/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values:
 1. `raw`: All search results are exported
 2. `throughput`: Pareto frontier of throughput results is exported
 3. `latency`: Pareto frontier of latency results is exported
@@ -212,8 +212,8 @@ CSV files `/result/search/*.csv`. 
The usage of this script is: ```bash usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] - [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}] - [--raw] + [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}] + [--time-unit {s,ms,us}] [--raw] options: -h, --help show this help message and exit @@ -237,6 +237,7 @@ options: --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) + --x-start X_START Recall values to start the x-axis from (default: 0.8) --mode {throughput,latency} search mode whose Pareto frontier is used on the y-axis (default: throughput) --time-unit {s,ms,us} diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 5cb06c573f..c8a6375577 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -74,7 +74,9 @@ def read_file(dataset, dataset_path, method): try: data = json.load(f) df = pd.DataFrame(data["benchmarks"]) - yield os.path.join(dir, file), file.split("-")[0], df + filename_split = file.split(",") + algo_name = (filename_split[0], filename_split[1]) + yield os.path.join(dir, file), algo_name, df except Exception as e: print( "An error occurred processing file %s (%s). " @@ -85,7 +87,10 @@ def read_file(dataset, dataset_path, method): def convert_json_to_csv_build(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "build"): try: - algo_name = algo_name.replace("_base", "") + if "base" in algo_name[1]: + algo_name = algo_name[0] + else: + algo_name = "_".join(algo_name) df["name"] = df["name"].str.split("/").str[0] write = pd.DataFrame( { @@ -97,12 +102,7 @@ def convert_json_to_csv_build(dataset, dataset_path): for name in df: if name not in skip_build_cols: write[name] = df[name] - filepath = os.path.normpath(file).split(os.sep) - filename = filepath[-1].split("-")[0] + ".csv" - write.to_csv( - os.path.join(f"{os.sep}".join(filepath[:-1]), filename), - index=False, - ) + write.to_csv(file.replace(".json", ".csv"), index=False) except Exception as e: print( "An error occurred processing file %s (%s). Skipping..." 
@@ -140,9 +140,17 @@ def convert_json_to_csv_search(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "search"): try: build_file = os.path.join( - dataset_path, dataset, "result", "build", f"{algo_name}.csv" + dataset_path, + dataset, + "result", + "build", + f"{','.join(algo_name)}.csv", ) - algo_name = algo_name.replace("_base", "") + print(build_file) + if "base" in algo_name[1]: + algo_name = algo_name[0] + else: + algo_name = "_".join(algo_name) df["name"] = df["name"].str.split("/").str[0] try: write = pd.DataFrame( @@ -201,13 +209,13 @@ def convert_json_to_csv_search(dataset, dataset_path): "appended in the Search CSV" ) - write.to_csv(file.replace(".json", "_raw.csv"), index=False) + write.to_csv(file.replace(".json", ",raw.csv"), index=False) throughput = get_frontier(write, "throughput") throughput.to_csv( - file.replace(".json", "_throughput.csv"), index=False + file.replace(".json", ",throughput.csv"), index=False ) latency = get_frontier(write, "latency") - latency.to_csv(file.replace(".json", "_latency.csv"), index=False) + latency.to_csv(file.replace(".json", ",latency.csv"), index=False) except Exception as e: print( "An error occurred processing file %s (%s). Skipping..." diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index 8bd54170c9..86fd527f5f 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -62,6 +62,19 @@ def positive_int(input_str: str) -> int: return i +def positive_float(input_str: str) -> float: + try: + i = float(input_str) + if i < 0.0: + raise ValueError + except ValueError: + raise argparse.ArgumentTypeError( + f"{input_str} is not a positive float" + ) + + return i + + def generate_n_colors(n): vs = np.linspace(0.3, 0.9, 7) colors = [(0.9, 0.4, 0.4, 1.0)] @@ -113,9 +126,11 @@ def create_plot_search( batch_size, mode, time_unit, + x_start, ): xn = "k-nn" xm, ym = (metrics[xn], metrics[mode]) + xm["lim"][0] = x_start # Now generate each plot handles = [] labels = [] @@ -211,20 +226,15 @@ def inv_fun(x): def create_plot_build( - build_results, search_results, linestyles, fn_out, dataset + build_results, search_results, linestyles, fn_out, dataset, k, batch_size ): + bt_80 = [0] * len(linestyles) - qps_85 = [-1] * len(linestyles) - bt_85 = [0] * len(linestyles) - i_85 = [-1] * len(linestyles) - - qps_90 = [-1] * len(linestyles) bt_90 = [0] * len(linestyles) - i_90 = [-1] * len(linestyles) - qps_95 = [-1] * len(linestyles) bt_95 = [0] * len(linestyles) - i_95 = [-1] * len(linestyles) + + bt_99 = [0] * len(linestyles) data = OrderedDict() colors = OrderedDict() @@ -237,35 +247,59 @@ def mean_y(algo): for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)): points = np.array(search_results[algo], dtype=object) + # x is recall, ls is algo_name, idxs is index_name xs = points[:, 2] - ys = points[:, 3] ls = points[:, 0] idxs = points[:, 1] - # x is recall, y is qps, ls is algo_name, idxs is index_name + + len_80, len_90, len_95, len_99 = 0, 0, 0, 0 for i in range(len(xs)): - if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]: - qps_85[pos] = ys[i] - bt_85[pos] = build_results[(ls[i], idxs[i])][0][2] - i_85[pos] = idxs[i] - elif xs[i] >= 0.9 and xs[i] < 0.95 and ys[i] > qps_90[pos]: - qps_90[pos] = ys[i] - bt_90[pos] = build_results[(ls[i], idxs[i])][0][2] - i_90[pos] = idxs[i] - elif xs[i] >= 0.95 and ys[i] > qps_95[pos]: - qps_95[pos] = ys[i] - 
bt_95[pos] = build_results[(ls[i], idxs[i])][0][2] - i_95[pos] = idxs[i] - data[algo] = [bt_85[pos], bt_90[pos], bt_95[pos]] + if xs[i] >= 0.80 and xs[i] < 0.90: + bt_80[pos] = bt_80[pos] + build_results[(ls[i], idxs[i])][0][2] + len_80 = len_80 + 1 + elif xs[i] >= 0.9 and xs[i] < 0.95: + bt_90[pos] = bt_90[pos] + build_results[(ls[i], idxs[i])][0][2] + len_90 = len_90 + 1 + elif xs[i] >= 0.95 and xs[i] < 0.99: + bt_95[pos] = bt_95[pos] + build_results[(ls[i], idxs[i])][0][2] + len_95 = len_95 + 1 + elif xs[i] >= 0.99: + bt_99[pos] = bt_99[pos] + build_results[(ls[i], idxs[i])][0][2] + len_99 = len_99 + 1 + if len_80 > 0: + bt_80[pos] = bt_80[pos] / len_80 + if len_90 > 0: + bt_90[pos] = bt_90[pos] / len_90 + if len_95 > 0: + bt_95[pos] = bt_95[pos] / len_95 + if len_99 > 0: + bt_99[pos] = bt_99[pos] / len_99 + data[algo] = [ + bt_80[pos], + bt_90[pos], + bt_95[pos], + bt_99[pos], + ] colors[algo] = linestyles[algo][0] - index = ["@85% Recall", "@90% Recall", "@95% Recall"] + index = [ + "@80% Recall", + "@90% Recall", + "@95% Recall", + "@99% Recall", + ] df = pd.DataFrame(data, index=index) + df.replace(0.0, np.nan, inplace=True) + df = df.dropna(how="all") plt.figure(figsize=(12, 9)) ax = df.plot.bar(rot=0, color=colors) fig = ax.get_figure() print(f"writing build output to {fn_out}") - plt.title("Build Time for Highest QPS") + plt.title( + "Average Build Time within Recall Range " + f"for k={k} batch_size={batch_size}" + ) plt.suptitle(f"{dataset}") plt.ylabel("Build Time (s)") fig.savefig(fn_out) @@ -344,9 +378,9 @@ def load_all_results( ] elif method == "search": if raw: - suffix = "_raw" + suffix = ",raw" else: - suffix = f"_{mode}" + suffix = f",{mode}" result_files = [ result_file for result_file in result_files @@ -356,22 +390,20 @@ def load_all_results( raise FileNotFoundError(f"No CSV result files found in {results_path}") if method == "search": - result_files = [ - result_filename - for result_filename in result_files - if f"{k}-{batch_size}" in result_filename - ] - algo_group_files = [ - result_filename.split("-")[0] for result_filename in result_files - ] - else: - algo_group_files = [ - result_filename for result_filename in result_files - ] - - for i in range(len(algo_group_files)): - algo_group = algo_group_files[i].replace(".csv", "").split("_") - algo_group_files[i] = ("_".join(algo_group[:-1]), algo_group[-1]) + filter_k_bs = [] + for result_filename in result_files: + filename_split = result_filename.split(",") + if ( + int(filename_split[-3][1:]) == k + and int(filename_split[-2][2:]) == batch_size + ): + filter_k_bs.append(result_filename) + result_files = filter_k_bs + + algo_group_files = [ + result_filename.replace(".csv", "").split(",")[:2] + for result_filename in result_files + ] algo_group_files = list(zip(*algo_group_files)) if len(algorithms) > 0: @@ -478,6 +510,12 @@ def main(): choices=["linear", "log", "symlog", "logit"], default="linear", ) + parser.add_argument( + "--x-start", + help="Recall values to start the x-axis from", + default=0.8, + type=positive_float, + ) parser.add_argument( "--mode", help="search mode whose Pareto frontier is used on the y-axis", @@ -525,7 +563,7 @@ def main(): ) build_output_filepath = os.path.join( args.output_filepath, - f"build-{args.dataset}.png", + f"build-{args.dataset}-k{k}-batch_size{batch_size}.png", ) search_results = load_all_results( @@ -554,6 +592,7 @@ def main(): batch_size, args.mode, args.time_unit, + args.x_start, ) if build: build_results = load_all_results( @@ -575,6 +614,8 @@ def main(): linestyles, 
build_output_filepath, args.dataset, + k, + batch_size, ) diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index a1f97d67d5..52d536c2e8 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -115,14 +115,16 @@ def validate_algorithm(algos_conf, algo, gpu_present): def find_executable(algos_conf, algo, group, k, batch_size): executable = algos_conf[algo]["executable"] - return_str = f"{algo}_{group}-{k}-{batch_size}" + file_name = (f"{algo},{group}", f"{algo},{group},k{k},bs{batch_size}") build_path = os.getenv("RAFT_HOME") if build_path is not None: - build_path = os.path.join(build_path, "cpp", "build", executable) + build_path = os.path.join( + build_path, "cpp", "build", "release", executable + ) if os.path.exists(build_path): print(f"-- Using RAFT bench from repository in {build_path}. ") - return (executable, build_path, return_str) + return (executable, build_path, file_name) # if there is no build folder present, we look in the conda environment conda_path = os.getenv("CONDA_PREFIX") @@ -130,7 +132,7 @@ def find_executable(algos_conf, algo, group, k, batch_size): conda_path = os.path.join(conda_path, "bin", "ann", executable) if os.path.exists(conda_path): print("-- Using RAFT bench found in conda environment. ") - return (executable, conda_path, return_str) + return (executable, conda_path, file_name) else: raise FileNotFoundError(executable) @@ -152,15 +154,21 @@ def run_build_and_search( mode="throughput", raft_log_level="info", ): - for executable, ann_executable_path, algo in executables_to_run.keys(): + for ( + executable, + ann_executable_path, + output_filename, + ) in executables_to_run.keys(): # Need to write temporary configuration - temp_conf_filename = f"{conf_filename}_{algo}_{uuid.uuid1()}.json" + temp_conf_filename = ( + f"{conf_filename}_{output_filename[1]}_{uuid.uuid1()}.json" + ) with open(temp_conf_filename, "w") as f: temp_conf = dict() temp_conf["dataset"] = conf_file["dataset"] temp_conf["search_basic_param"] = conf_file["search_basic_param"] temp_conf["index"] = executables_to_run[ - (executable, ann_executable_path, algo) + (executable, ann_executable_path, output_filename) ]["index"] json_str = json.dumps(temp_conf, indent=2) f.write(json_str) @@ -172,7 +180,7 @@ def run_build_and_search( if build: build_folder = os.path.join(legacy_result_folder, "build") os.makedirs(build_folder, exist_ok=True) - build_file = f"{algo}.json" + build_file = f"{output_filename[0]}.json" temp_build_file = f"{build_file}.lock" cmd = [ ann_executable_path, @@ -190,7 +198,8 @@ def run_build_and_search( if dry_run: print( - "Benchmark command for %s:\n%s\n" % (algo, " ".join(cmd)) + "Benchmark command for %s:\n%s\n" + % (output_filename[0], " ".join(cmd)) ) else: try: @@ -208,6 +217,7 @@ def run_build_and_search( if search: search_folder = os.path.join(legacy_result_folder, "search") os.makedirs(search_folder, exist_ok=True) + search_file = f"{output_filename[1]}.json" cmd = [ ann_executable_path, "--search", @@ -219,7 +229,7 @@ def run_build_and_search( "--benchmark_out_format=json", "--mode=%s" % mode, "--benchmark_out=" - + f"{os.path.join(search_folder, f'{algo}.json')}", + + f"{os.path.join(search_folder, search_file)}", "--raft_log_level=" + f"{parse_log_level(raft_log_level)}", ] if force: @@ -231,7 +241,8 @@ def run_build_and_search( cmd = cmd + [temp_conf_filename] if dry_run: print( - "Benchmark command for 
%s:\n%s\n" % (algo, " ".join(cmd)) + "Benchmark command for %s:\n%s\n" + % (output_filename[1], " ".join(cmd)) ) else: try: