From c48476b75639baddae168f40486cc6864f463ed4 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 02:37:16 +0000 Subject: [PATCH 1/8] pareto frontier in data_export --- docs/source/raft_ann_benchmarks.md | 27 ++- .../raft-ann-bench/data_export/__main__.py | 55 +++++- .../src/raft-ann-bench/plot/__main__.py | 157 +++++++++--------- 3 files changed, 145 insertions(+), 94 deletions(-) diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index e6c4eaedd0..2bf7d97559 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -198,27 +198,32 @@ options: --dataset-path DATASET_PATH path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR}) ``` -Build statistics CSV file is stored in `/result/build/` -and index search statistics CSV file in `/result/search/`. +Build statistics CSV file is stored in `/result/build/` +and index search statistics CSV file in `/result/search/`, where suffix has three values: +1. `raw`: All search results are exported +2. `throughput`: Pareto frontier of throughput results is exported +3. `latency`: Pareto frontier of latency results is exported + ### Step 4: Plot Results The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics -CSV file in `/result/search/<-k{k}-batch_size{batch_size}>.csv`. +CSV files `/result/search/*.csv`. The usage of this script is: ```bash -usage: __main__.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-k COUNT] - [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] +usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] + [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--raw] options: -h, --help show this help message and exit --dataset DATASET dataset to plot (default: glove-100-inner) --dataset-path DATASET_PATH - path to dataset folder (default: os.getcwd()/datasets/) + path to dataset folder (default: /home/coder/raft/datasets/) --output-filepath OUTPUT_FILEPATH - directory for PNG to be saved (default: os.getcwd()) + directory for PNG to be saved (default: /home/coder/raft) --algorithms ALGORITHMS - plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None) + plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default + (default: None) --groups GROUPS plot only comma separated groups of parameters (default: base) --algo-groups ALGO_GROUPS, --algo-groups ALGO_GROUPS add comma separated . to plot. Example usage: "--algo-groups=raft_cagra.large,hnswlib.large" (default: None) @@ -231,8 +236,12 @@ options: --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) - --raw Show raw results (not just Pareto frontier) in faded colours (default: False) + --mode {throughput,latency} + metric whose Pareto frontier is used on the y-axis (default: throughput) + --raw Show raw results (not just Pareto frontier) of metric arg (default: False) ``` +`mode`: plots pareto frontier of `throughput` or `latency` results exported in the previous step + `algorithms`: plots all algorithms that it can find results for the specified `dataset`. By default, only `base` group will be plotted. `groups`: plot only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 4978c99d60..2e55fdd1fc 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -43,9 +43,26 @@ ) skip_search_cols = ( - set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols + set(["recall", "qps", "latency", "items_per_second", "Recall", "Latency"]) + | skip_build_cols ) +metrics = { + "k-nn": { + "description": "Recall", + "worst": float("-inf"), + "lim": [0.0, 1.03], + }, + "throughput": { + "description": "Queries per second (1/s)", + "worst": float("-inf"), + }, + "latency": { + "description": "Search Latency (s)", + "worst": float("inf"), + }, +} + def read_file(dataset, dataset_path, method): dir = os.path.join(dataset_path, dataset, "result", method) @@ -92,6 +109,31 @@ def convert_json_to_csv_build(dataset, dataset_path): traceback.print_exc() +def create_pointset(data, xn, yn): + xm, ym = (metrics[xn], metrics[yn]) + rev_y = -1 if ym["worst"] < 0 else 1 + rev_x = -1 if xm["worst"] < 0 else 1 + + y_idx = 3 if yn == "throughput" else 4 + data.sort(key=lambda t: (rev_y * t[y_idx], rev_x * t[2])) + + lines = [] + last_x = xm["worst"] + comparator = ( + (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) + ) + for d in data: + if comparator(d[2], last_x): + last_x = d[2] + lines.append(d) + return lines + + +def get_frontier(df, metric): + lines = create_pointset(df.values.tolist(), "k-nn", metric) + return pd.DataFrame(lines, columns=df.columns) + + def convert_json_to_csv_search(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "search"): try: @@ -105,7 +147,8 @@ def convert_json_to_csv_search(dataset, dataset_path): "algo_name": [algo_name] * len(df), "index_name": df["name"], "recall": df["Recall"], - "qps": df["items_per_second"], + "throughput": df["items_per_second"], + "latency": df["Latency"], } ) for name in df: @@ -141,7 +184,13 @@ def convert_json_to_csv_search(dataset, dataset_path): "appended in the Search CSV" ) - write.to_csv(file.replace(".json", ".csv"), index=False) + write.to_csv(file.replace(".json", "_raw.csv"), index=False) + throughput = get_frontier(write, "throughput") + throughput.to_csv( + file.replace(".json", "_throughput.csv"), index=False + ) + latency = get_frontier(write, "latency") + latency.to_csv(file.replace(".json", "_latency.csv"), index=False) except Exception as e: print( "An error occurred processing file %s (%s). Skipping..." diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index c45ff5b14e..f8f31f25a2 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -38,10 +38,14 @@ "worst": float("-inf"), "lim": [0.0, 1.03], }, - "qps": { + "throughput": { "description": "Queries per second (1/s)", "worst": float("-inf"), }, + "latency": { + "description": "Search Latency (s)", + "worst": float("inf"), + }, } @@ -98,53 +102,20 @@ def create_linestyles(unique_algorithms): ) -def get_up_down(metric): - if metric["worst"] == float("inf"): - return "down" - return "up" - - -def get_left_right(metric): - if metric["worst"] == float("inf"): - return "left" - return "right" - - -def create_pointset(data, xn, yn): - xm, ym = (metrics[xn], metrics[yn]) - rev_y = -1 if ym["worst"] < 0 else 1 - rev_x = -1 if xm["worst"] < 0 else 1 - data.sort(key=lambda t: (rev_y * t[-1], rev_x * t[-2])) - - axs, ays, als, aidxs = [], [], [], [] - # Generate Pareto frontier - xs, ys, ls, idxs = [], [], [], [] - last_x = xm["worst"] - comparator = ( - (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) - ) - for algo_name, index_name, xv, yv in data: - if not xv or not yv: - continue - axs.append(xv) - ays.append(yv) - als.append(algo_name) - aidxs.append(algo_name) - if comparator(xv, last_x): - last_x = xv - xs.append(xv) - ys.append(yv) - ls.append(algo_name) - idxs.append(index_name) - return xs, ys, ls, idxs, axs, ays, als, aidxs - - def create_plot_search( - all_data, raw, x_scale, y_scale, fn_out, linestyles, dataset, k, batch_size + all_data, + raw, + x_scale, + y_scale, + fn_out, + linestyles, + dataset, + k, + batch_size, + mode, ): xn = "k-nn" - yn = "qps" - xm, ym = (metrics[xn], metrics[yn]) + xm, ym = (metrics[xn], metrics[mode]) # Now generate each plot handles = [] labels = [] @@ -152,17 +123,16 @@ def create_plot_search( # Sorting by mean y-value helps aligning plots with labels def mean_y(algo): - xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset( - all_data[algo], xn, yn - ) - return -np.log(np.array(ys)).mean() + points = np.array(all_data[algo], dtype=object) + print(points[:, 3]) + return -np.log(np.array(points[:, 3], dtype=np.float32)).mean() # Find range for logit x-scale min_x, max_x = 1, 0 for algo in sorted(all_data.keys(), key=mean_y): - xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset( - all_data[algo], xn, yn - ) + points = np.array(all_data[algo], dtype=object) + xs = points[:, 2] + ys = points[:, 3] min_x = min([min_x] + [x for x in xs if x > 0]) max_x = max([max_x] + [x for x in xs if x < 1]) color, faded, linestyle, marker = linestyles[algo] @@ -178,23 +148,12 @@ def mean_y(algo): marker=marker, ) handles.append(handle) - if raw: - (handle2,) = plt.plot( - axs, - ays, - "-", - label=algo, - color=faded, - ms=5, - mew=2, - lw=2, - marker=marker, - ) + labels.append(algo) ax = plt.gca() ax.set_ylabel(ym["description"]) - ax.set_xlabel(xm["description"]) + ax.set_xlabel("Recall") # Custom scales of the type --x-scale a3 if x_scale[0] == "a": alpha = float(x_scale[1:]) @@ -250,10 +209,15 @@ def inv_fun(x): def create_plot_build( - build_results, search_results, linestyles, fn_out, dataset, k, batch_size + build_results, + search_results, + linestyles, + fn_out, + dataset, + k, + batch_size, + mode, ): - xn = "k-nn" - yn = "qps" qps_85 = [-1] * len(linestyles) bt_85 = [0] * len(linestyles) @@ -271,16 +235,17 @@ def create_plot_build( colors = OrderedDict() # Sorting by mean y-value helps aligning plots with labels + def mean_y(algo): - xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset( - search_results[algo], xn, yn - ) - return -np.log(np.array(ys)).mean() + points = np.array(search_results[algo], dtype=object) + return -np.log(np.array(points[:, 3], dtype=np.float32)).mean() for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)): - xs, ys, ls, idxs, axs, ays, als, aidxs = create_pointset( - search_results[algo], xn, yn - ) + points = np.array(search_results[algo], dtype=object) + xs = points[:, 2] + ys = points[:, 3] + ls = points[:, 0] + idxs = points[:, 1] # x is recall, y is qps, ls is algo_name, idxs is index_name for i in range(len(xs)): if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]: @@ -311,7 +276,7 @@ def mean_y(algo): fig.savefig(fn_out) -def load_lines(results_path, result_files, method, index_key): +def load_lines(results_path, result_files, method, index_key, mode): results = dict() for result_filename in result_files: @@ -323,7 +288,8 @@ def load_lines(results_path, result_files, method, index_key): if method == "build": key_idx = [2] elif method == "search": - key_idx = [2, 3] + y_idx = 3 if mode == "throughput" else 4 + key_idx = [2, y_idx] for line in lines[1:]: split_lines = line.split(",") @@ -354,12 +320,27 @@ def load_all_results( batch_size, method, index_key, + raw, + mode, ): results_path = os.path.join(dataset_path, "result", method) result_files = os.listdir(results_path) - result_files = [ - result_file for result_file in result_files if ".csv" in result_file - ] + if method == "build": + result_files = [ + result_file + for result_file in result_files + if ".csv" in result_file + ] + elif method == "search": + if raw: + suffix = "_raw" + else: + suffix = f"_{mode}" + result_files = [ + result_file + for result_file in result_files + if f"{suffix}.csv" in result_file + ] if method == "search": result_files = [ result_filename @@ -407,7 +388,7 @@ def load_all_results( final_results = final_results + final_algo_groups final_results = set(final_results) - results = load_lines(results_path, final_results, method, index_key) + results = load_lines(results_path, final_results, method, index_key, mode) return results @@ -481,9 +462,15 @@ def main(): choices=["linear", "log", "symlog", "logit"], default="linear", ) + parser.add_argument( + "--mode", + help="metric whose Pareto frontier is used on the y-axis", + choices=["throughput", "latency"], + default="throughput", + ) parser.add_argument( "--raw", - help="Show raw results (not just Pareto frontier) in faded colours", + help="Show raw results (not just Pareto frontier) of metric arg", action="store_true", ) @@ -528,6 +515,8 @@ def main(): batch_size, "search", "algo", + args.raw, + args.mode, ) linestyles = create_linestyles(sorted(search_results.keys())) if search: @@ -541,6 +530,7 @@ def main(): args.dataset, k, batch_size, + args.mode, ) if build: build_results = load_all_results( @@ -552,6 +542,8 @@ def main(): batch_size, "build", "index", + args.raw, + args.mode, ) create_plot_build( build_results, @@ -561,6 +553,7 @@ def main(): args.dataset, k, batch_size, + args.mode, ) From b81a2c4909a20d37c60295f1077c2af47d599517 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 02:39:49 +0000 Subject: [PATCH 2/8] more fixes --- .../src/raft-ann-bench/plot/__main__.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index f8f31f25a2..d4e2a23ed7 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -104,7 +104,6 @@ def create_linestyles(unique_algorithms): def create_plot_search( all_data, - raw, x_scale, y_scale, fn_out, @@ -124,7 +123,6 @@ def create_plot_search( # Sorting by mean y-value helps aligning plots with labels def mean_y(algo): points = np.array(all_data[algo], dtype=object) - print(points[:, 3]) return -np.log(np.array(points[:, 3], dtype=np.float32)).mean() # Find range for logit x-scale @@ -209,14 +207,7 @@ def inv_fun(x): def create_plot_build( - build_results, - search_results, - linestyles, - fn_out, - dataset, - k, - batch_size, - mode, + build_results, search_results, linestyles, fn_out, dataset ): qps_85 = [-1] * len(linestyles) @@ -522,7 +513,6 @@ def main(): if search: create_plot_search( search_results, - args.raw, args.x_scale, args.y_scale, search_output_filepath, @@ -551,9 +541,6 @@ def main(): linestyles, build_output_filepath, args.dataset, - k, - batch_size, - args.mode, ) From d94041590998a1385643078f7d518494b49af02f Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 02:45:06 +0000 Subject: [PATCH 3/8] fix local paths --- docs/source/raft_ann_benchmarks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 2bf7d97559..0b1f953cfa 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -218,9 +218,9 @@ options: -h, --help show this help message and exit --dataset DATASET dataset to plot (default: glove-100-inner) --dataset-path DATASET_PATH - path to dataset folder (default: /home/coder/raft/datasets/) + path to dataset folder (default: os.getcwd()/datasets/) --output-filepath OUTPUT_FILEPATH - directory for PNG to be saved (default: /home/coder/raft) + directory for PNG to be saved (default: os.getcwd()) --algorithms ALGORITHMS plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None) From a43cdb1813f92ec1c0f43838412a67f8b1367f91 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 02:46:04 +0000 Subject: [PATCH 4/8] fix typos --- python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index d4e2a23ed7..a22567b307 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -455,13 +455,13 @@ def main(): ) parser.add_argument( "--mode", - help="metric whose Pareto frontier is used on the y-axis", + help="search mode whose Pareto frontier is used on the y-axis", choices=["throughput", "latency"], default="throughput", ) parser.add_argument( "--raw", - help="Show raw results (not just Pareto frontier) of metric arg", + help="Show raw results (not just Pareto frontier) of mode arg", action="store_true", ) From 6a50f26a7c378b489592a5c10b8f313192885a52 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 02:53:09 +0000 Subject: [PATCH 5/8] fix latency time unit --- .../raft-ann-bench/src/raft-ann-bench/data_export/__main__.py | 2 +- python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 2e55fdd1fc..77d8bead71 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -58,7 +58,7 @@ "worst": float("-inf"), }, "latency": { - "description": "Search Latency (s)", + "description": "Search Latency (ms)", "worst": float("inf"), }, } diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index a22567b307..a86b94fe67 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -43,7 +43,7 @@ "worst": float("-inf"), }, "latency": { - "description": "Search Latency (s)", + "description": "Search Latency (ms)", "worst": float("inf"), }, } From 782fa680840f44c70f6b22814a375ce4f3d43fcd Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 03:08:27 +0000 Subject: [PATCH 6/8] add fileread safeguards --- .../raft-ann-bench/src/raft-ann-bench/plot/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index a86b94fe67..cfd958b3da 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -271,7 +271,7 @@ def load_lines(results_path, result_files, method, index_key, mode): results = dict() for result_filename in result_files: - if result_filename.endswith(".csv"): + try: with open(os.path.join(results_path, result_filename), "r") as f: lines = f.readlines() lines = lines[:-1] if lines[-1] == "\n" else lines @@ -298,6 +298,11 @@ def load_lines(results_path, result_files, method, index_key, mode): for key_i in key_idx: to_add.append(float(split_lines[key_i])) results[dict_key].append(to_add) + except Exception: + print( + f"An error occurred processing file {result_filename}. " + "Skipping..." + ) return results @@ -332,6 +337,9 @@ def load_all_results( for result_file in result_files if f"{suffix}.csv" in result_file ] + if len(result_files) == 0: + raise FileNotFoundError(f"No CSV result files found in {results_path}") + if method == "search": result_files = [ result_filename From 08fb31a7594b56854b5bfcb19298d5f95545b498 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 17 Nov 2023 16:42:20 +0000 Subject: [PATCH 7/8] add more try-excepts --- .../raft-ann-bench/data_export/__main__.py | 61 ++++++++++++------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 77d8bead71..88cd4f18ec 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -142,15 +142,21 @@ def convert_json_to_csv_search(dataset, dataset_path): ) algo_name = algo_name.replace("_base", "") df["name"] = df["name"].str.split("/").str[0] - write = pd.DataFrame( - { - "algo_name": [algo_name] * len(df), - "index_name": df["name"], - "recall": df["Recall"], - "throughput": df["items_per_second"], - "latency": df["Latency"], - } - ) + try: + write = pd.DataFrame( + { + "algo_name": [algo_name] * len(df), + "index_name": df["name"], + "recall": df["Recall"], + "throughput": df["items_per_second"], + "latency": df["Latency"], + } + ) + except Exception as e: + print( + "Search file %s (%s) missing a key. Skipping..." + % (file, e) + ) for name in df: if name not in skip_search_cols: write[name] = df[name] @@ -163,20 +169,29 @@ def convert_json_to_csv_search(dataset, dataset_path): write["build cpu_time"] = None write["build GPU"] = None - for col_idx in range(6, len(build_df.columns)): - col_name = build_df.columns[col_idx] - write[col_name] = None - - for s_index, search_row in write.iterrows(): - for b_index, build_row in build_df.iterrows(): - if search_row["index_name"] == build_row["index_name"]: - write.iloc[s_index, write_ncols] = build_df.iloc[ - b_index, 2 - ] - write.iloc[ - s_index, write_ncols + 1 : - ] = build_df.iloc[b_index, 3:] - break + try: + for col_idx in range(6, len(build_df.columns)): + col_name = build_df.columns[col_idx] + write[col_name] = None + + for s_index, search_row in write.iterrows(): + for b_index, build_row in build_df.iterrows(): + if ( + search_row["index_name"] + == build_row["index_name"] + ): + write.iloc[ + s_index, write_ncols + ] = build_df.iloc[b_index, 2] + write.iloc[ + s_index, write_ncols + 1 : + ] = build_df.iloc[b_index, 3:] + break + except Exception as e: + print( + "Build file %s (%s) missing a key. Skipping..." + % (build_file, e) + ) else: warnings.warn( f"Build CSV not found for {algo_name}, " From edbdb24ef33388df6112a716a94649ac40843da0 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 17 Nov 2023 18:44:24 +0000 Subject: [PATCH 8/8] allow time unit scaling for latency mode plots --- docs/source/raft_ann_benchmarks.md | 13 ++++--- .../raft-ann-bench/data_export/__main__.py | 2 +- .../src/raft-ann-bench/plot/__main__.py | 34 ++++++++++++++++--- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 0b1f953cfa..dcdfc2cec9 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -212,15 +212,16 @@ CSV files `/result/search/*.csv`. The usage of this script is: ```bash usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] - [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--raw] + [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}] + [--raw] options: -h, --help show this help message and exit --dataset DATASET dataset to plot (default: glove-100-inner) --dataset-path DATASET_PATH - path to dataset folder (default: os.getcwd()/datasets/) + path to dataset folder (default: /home/coder/raft/datasets/) --output-filepath OUTPUT_FILEPATH - directory for PNG to be saved (default: os.getcwd()) + directory for PNG to be saved (default: /home/coder/raft) --algorithms ALGORITHMS plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default (default: None) @@ -237,8 +238,10 @@ options: --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) --mode {throughput,latency} - metric whose Pareto frontier is used on the y-axis (default: throughput) - --raw Show raw results (not just Pareto frontier) of metric arg (default: False) + search mode whose Pareto frontier is used on the y-axis (default: throughput) + --time-unit {s,ms,us} + time unit to plot when mode is latency (default: ms) + --raw Show raw results (not just Pareto frontier) of mode arg (default: False) ``` `mode`: plots pareto frontier of `throughput` or `latency` results exported in the previous step diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 88cd4f18ec..572b81bbe2 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -58,7 +58,7 @@ "worst": float("-inf"), }, "latency": { - "description": "Search Latency (ms)", + "description": "Search Latency (s)", "worst": float("inf"), }, } diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index cfd958b3da..8bd54170c9 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -43,7 +43,7 @@ "worst": float("-inf"), }, "latency": { - "description": "Search Latency (ms)", + "description": "Search Latency (s)", "worst": float("inf"), }, } @@ -112,6 +112,7 @@ def create_plot_search( k, batch_size, mode, + time_unit, ): xn = "k-nn" xm, ym = (metrics[xn], metrics[mode]) @@ -150,7 +151,10 @@ def mean_y(algo): labels.append(algo) ax = plt.gca() - ax.set_ylabel(ym["description"]) + y_description = ym["description"] + if mode == "latency": + y_description = y_description.replace("(s)", f"({time_unit})") + ax.set_ylabel(y_description) ax.set_xlabel("Recall") # Custom scales of the type --x-scale a3 if x_scale[0] == "a": @@ -267,7 +271,7 @@ def mean_y(algo): fig.savefig(fn_out) -def load_lines(results_path, result_files, method, index_key, mode): +def load_lines(results_path, result_files, method, index_key, mode, time_unit): results = dict() for result_filename in result_files: @@ -297,6 +301,16 @@ def load_lines(results_path, result_files, method, index_key, mode): to_add = [algo_name, index_name] for key_i in key_idx: to_add.append(float(split_lines[key_i])) + if ( + mode == "latency" + and time_unit != "s" + and method == "search" + ): + to_add[-1] = ( + to_add[-1] * (10**3) + if time_unit == "ms" + else to_add[-1] * (10**6) + ) results[dict_key].append(to_add) except Exception: print( @@ -318,6 +332,7 @@ def load_all_results( index_key, raw, mode, + time_unit, ): results_path = os.path.join(dataset_path, "result", method) result_files = os.listdir(results_path) @@ -387,7 +402,9 @@ def load_all_results( final_results = final_results + final_algo_groups final_results = set(final_results) - results = load_lines(results_path, final_results, method, index_key, mode) + results = load_lines( + results_path, final_results, method, index_key, mode, time_unit + ) return results @@ -467,6 +484,12 @@ def main(): choices=["throughput", "latency"], default="throughput", ) + parser.add_argument( + "--time-unit", + help="time unit to plot when mode is latency", + choices=["s", "ms", "us"], + default="ms", + ) parser.add_argument( "--raw", help="Show raw results (not just Pareto frontier) of mode arg", @@ -516,6 +539,7 @@ def main(): "algo", args.raw, args.mode, + args.time_unit, ) linestyles = create_linestyles(sorted(search_results.keys())) if search: @@ -529,6 +553,7 @@ def main(): k, batch_size, args.mode, + args.time_unit, ) if build: build_results = load_all_results( @@ -542,6 +567,7 @@ def main(): "index", args.raw, args.mode, + args.time_unit, ) create_plot_build( build_results,