Skip to content

Commit

Permalink
Merge pull request #5 from divyegala/python-ann-bench-use-gbench
Browse files Browse the repository at this point in the history
Add `data_export.py`
  • Loading branch information
cjnolet authored Aug 29, 2023
2 parents c28326c + ef7b7f9 commit 39dd3f4
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 72 deletions.
65 changes: 65 additions & 0 deletions bench/ann/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import pandas as pd
import os
import json


def read_file(dataset, dataset_path, method):
    """Yield benchmark results found under ``<dataset_path>/<dataset>/result/<method>``.

    Scans the directory for Google Benchmark JSON output files and, for
    each one, yields the parsed results.  The algorithm name is taken from
    the file name up to the first ``-`` (run.py names these files
    ``<algo>-k{k}-batch_size{batch_size}.json``).

    Parameters
    ----------
    dataset : str
        Name of the dataset whose results should be read.
    dataset_path : str
        Root folder containing one sub-folder per dataset.
    method : str
        Result category to read, e.g. ``"build"`` or ``"search"``.

    Yields
    ------
    tuple
        ``(json_file_path, algorithm_name, benchmarks_dataframe)`` where the
        DataFrame is built from the JSON file's ``"benchmarks"`` records.
    """
    # NOTE: the original used 'dir' and 'file' as names, shadowing builtins;
    # renamed and the joined path is now computed once.
    result_dir = os.path.join(dataset_path, dataset, "result", method)
    for file_name in os.listdir(result_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(result_dir, file_name)
            with open(file_path, "r") as f:
                data = json.load(f)
                df = pd.DataFrame(data["benchmarks"])
                yield (file_path, file_name.split('-')[0], df)

def convert_json_to_csv_build(dataset, dataset_path):
    """Convert every build-result JSON file for *dataset* into a CSV file.

    For each JSON file produced by run.py under ``.../result/build``, writes
    a CSV with the same base name next to it, containing the columns
    ``algo_name``, ``index_name`` and ``time``.
    """
    for json_path, algo_name, frame in read_file(dataset, dataset_path, "build"):
        # Keep only the index-name portion before the first '/'.
        frame['name'] = frame['name'].str.split('/').str[0]
        out = pd.DataFrame(
            {
                'algo_name': [algo_name] * len(frame),
                'index_name': frame['name'],
                'time': frame['real_time'],
            }
        )
        csv_path = json_path.replace('.json', '.csv')
        out.to_csv(csv_path, index=False)


def convert_json_to_csv_search(dataset, dataset_path):
    """Convert every search-result JSON file for *dataset* into a CSV file.

    For each JSON file produced by run.py under ``.../result/search``, writes
    a CSV with the same base name next to it, containing the columns
    ``algo_name``, ``index_name``, ``recall`` and ``qps``.
    """
    for json_path, algo_name, frame in read_file(dataset, dataset_path, "search"):
        # Keep only the index-name portion before the first '/'.
        frame['name'] = frame['name'].str.split('/').str[0]
        out = pd.DataFrame(
            {
                'algo_name': [algo_name] * len(frame),
                'index_name': frame['name'],
                'recall': frame['Recall'],
                'qps': frame['items_per_second'],
            }
        )
        csv_path = json_path.replace('.json', '.csv')
        out.to_csv(csv_path, index=False)


def main():
    """Parse command-line arguments and export build/search results to CSV.

    Raises
    ------
    RuntimeError
        If ``--dataset-path`` is not supplied and the ``RAFT_HOME``
        environment variable is unset.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dataset",
                        help="name of the dataset to export results for",
                        default="glove-100-inner")
    # Default is resolved lazily below: the original evaluated
    # os.getenv("RAFT_HOME") eagerly, so an unset RAFT_HOME crashed with a
    # TypeError even when --dataset-path was given explicitly.
    parser.add_argument("--dataset-path",
                        help="path to dataset folder "
                             "(default: ${RAFT_HOME}/bench/ann/data)",
                        default=None)
    args = parser.parse_args()

    dataset_path = args.dataset_path
    if dataset_path is None:
        raft_home = os.getenv("RAFT_HOME")
        if raft_home is None:
            raise RuntimeError(
                "RAFT_HOME environment variable must be set when "
                "--dataset-path is not provided")
        dataset_path = os.path.join(raft_home, "bench", "ann", "data")

    convert_json_to_csv_build(args.dataset, dataset_path)
    convert_json_to_csv_search(args.dataset, dataset_path)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
74 changes: 31 additions & 43 deletions bench/ann/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,18 +203,18 @@ def create_plot_build(build_results, search_results, linestyles, fn_out,
xn = "k-nn"
yn = "qps"

recall_85 = [-1] * len(linestyles)
qps_85 = [-1] * len(linestyles)
bt_85 = [0] * len(linestyles)
i_85 = [-1] * len(linestyles)
recall_90 = [-1] * len(linestyles)

qps_90 = [-1] * len(linestyles)
bt_90 = [0] * len(linestyles)
i_90 = [-1] * len(linestyles)
recall_95 = [-1] * len(linestyles)

qps_95 = [-1] * len(linestyles)
bt_95 = [0] * len(linestyles)
i_95 = [-1] * len(linestyles)

data = OrderedDict()
colors = OrderedDict()

Expand Down Expand Up @@ -248,7 +248,7 @@ def mean_y(algo):
plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()
print(f"writing search output to {fn_out}")
print(f"writing build output to {fn_out}")
plt.title("Build Time for Highest QPS")
plt.suptitle(f"{dataset} k={k} batch_size={batch_size}")
plt.ylabel("Build Time (s)")
Expand All @@ -258,45 +258,33 @@ def mean_y(algo):
def load_lines(results_path, result_files, method, index_key):
results = dict()

linebreaker = "name,iterations"

for result_filename in result_files:
with open(os.path.join(results_path, result_filename), 'r') as f:
lines = f.readlines()
lines = lines[:-1] if lines[-1] == "\n" else lines
idx = 0
for pos, line in enumerate(lines):
if linebreaker in line:
idx = pos
break

if method == "build":
if "hnswlib" in result_filename:
if result_filename.endswith('.csv'):
with open(os.path.join(results_path, result_filename), 'r') as f:
lines = f.readlines()
lines = lines[:-1] if lines[-1] == "\n" else lines

if method == "build":
key_idx = [2]
else:
key_idx = [10]
elif method == "search":
if "hnswlib" in result_filename:
key_idx = [10, 6]
else:
key_idx = [12, 10]

for line in lines[idx+1:]:
split_lines = line.split(',')

algo_name = split_lines[0].split('.')[0].strip("\"")
index_name = split_lines[0].split('/')[0].strip("\"")

if index_key == "algo":
dict_key = algo_name
elif index_key == "index":
dict_key = (algo_name, index_name)
if dict_key not in results:
results[dict_key] = []
to_add = [algo_name, index_name]
for key_i in key_idx:
to_add.append(float(split_lines[key_i]))
results[dict_key].append(to_add)
elif method == "search":
key_idx = [2, 3]

for line in lines[1:]:
split_lines = line.split(',')

algo_name = split_lines[0]
index_name = split_lines[1]

if index_key == "algo":
dict_key = algo_name
elif index_key == "index":
dict_key = (algo_name, index_name)
if dict_key not in results:
results[dict_key] = []
to_add = [algo_name, index_name]
for key_i in key_idx:
to_add.append(float(split_lines[key_i]))
results[dict_key].append(to_add)

return results

Expand Down Expand Up @@ -375,8 +363,8 @@ def main():
build = args.build
search = args.search

search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png")
build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png")
search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-k{k}-batch_size{batch_size}.png")
build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-k{k}-batch_size{batch_size}.png")

search_results = load_all_results(
os.path.join(args.dataset_path, args.dataset),
Expand Down
13 changes: 6 additions & 7 deletions bench/ann/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def find_executable(algos_conf, algo, k, batch_size):
executable)
build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
if os.path.exists(conda_path):
return (executable, conda_path, f"{algo}-{k}-{batch_size}")
return (executable, conda_path, f"{algo}-k{k}-batch_size{batch_size}")
elif os.path.exists(build_path):
return (executable, build_path, f"{algo}-{k}-{batch_size}")
return (executable, build_path, f"{algo}-k{k}-batch_size{batch_size}")
else:
raise FileNotFoundError(executable)

Expand Down Expand Up @@ -72,8 +72,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir,
cmd = [ann_executable_path,
"--build",
"--data_prefix="+dataset_path,
"--benchmark_out_format=csv",
f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"]
"--benchmark_out_format=json",
f"--benchmark_out={os.path.join(build_folder, f'{algo}.json')}"]
if force:
cmd = cmd + ["--overwrite"]
cmd = cmd + [temp_conf_filepath]
Expand All @@ -90,9 +90,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir,
"--benchmark_counters_tabular",
"--override_kv=k:%s" % k,
"--override_kv=n_queries:%s" % batch_size,
"--benchmark_min_warmup_time=0.01",
"--benchmark_out_format=csv",
f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"]
"--benchmark_out_format=json",
f"--benchmark_out={os.path.join(search_folder, f'{algo}.json')}"]
if force:
cmd = cmd + ["--overwrite"]
cmd = cmd + [temp_conf_filepath]
Expand Down
13 changes: 2 additions & 11 deletions cpp/bench/ann/src/common/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,8 @@ auto load_lib(const std::string& algo) -> void*
auto found = libs.find(algo);

if (found != libs.end()) { return found->second.handle; }
auto lib_name = "lib" + algo + "_ann_bench.so";
std::string lib_path = "";
if (std::getenv("CONDA_PREFIX") != nullptr) {
auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/";
if (std::filesystem::exists(conda_path + "ANN_BENCH")) { lib_path = conda_path; }
}
if (std::getenv("RAFT_HOME") != nullptr) {
auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/";
if (std::filesystem::exists(build_path + "ANN_BENCH")) { lib_path = build_path; }
}
return libs.emplace(algo, lib_path + lib_name).first->second.handle;
auto lib_name = "lib" + algo + "_ann_bench.so";
return libs.emplace(algo, lib_name).first->second.handle;
}

auto get_fun_name(void* addr) -> std::string
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ dependencies:
- benchmark>=1.8.2
- faiss-proc=*=cuda
- matplotlib
- pyyaml
- pandas
- pyyaml

cudatoolkit:
specific:
Expand Down
52 changes: 42 additions & 10 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm
## Running the benchmarks

### Usage
There are 3 general steps to running the benchmarks and vizualizing the results:
There are 4 general steps to running the benchmarks and visualizing the results:
1. Prepare Dataset
2. Build Index and Search Index
3. Plot Results
3. Data Export
4. Plot Results

We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
Expand All @@ -46,7 +47,10 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize
# (2) build and search index
python bench/ann/run.py --dataset deep-image-96-inner

# (3) plot results
# (3) export data
python bench/ann/data_export.py --dataset deep-image-96-inner

# (4) plot results
python bench/ann/plot.py --dataset deep-image-96-inner
```

Expand Down Expand Up @@ -82,7 +86,10 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_
# (2) build and search index
python bench/ann/run.py --dataset deep-1B

# (3) plot results
# (3) export data
python bench/ann/data_export.py --dataset deep-1B

# (4) plot results
python bench/ann/plot.py --dataset deep-1B
```

Expand Down Expand Up @@ -160,27 +167,44 @@ options:
The configuration file should be named as `<dataset>.json`. It is optional if the name of the dataset is
provided with the `dataset` argument, in which case
a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path/<dataset>/result/build/<algo.csv>
and an index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo.csv>.
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path>/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.json>`
and an index search statistics JSON file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.json>`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
2. indices are built in `<dataset-path>/<dataset>/index`
3. search results are stored in `<dataset-path>/<dataset>/result`
3. build/search results are stored in `<dataset-path>/<dataset>/result`

`build` and `search` : if both parameters are not supplied to the script then
it is assumed both are `True`.

`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index
is available in `algos.yaml` and not disabled, as well as having an associated executable.

#### Step 3: Plot Results
#### Step 3: Data Export
The script `bench/ann/data_export.py` will convert the intermediate JSON outputs produced by `bench/ann/run.py` to more
easily readable CSV files, which are needed to build charts made by `bench/ann/plot.py`.

```bash
usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH]
options:
-h, --help show this help message and exit
--dataset DATASET dataset to download (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
```
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.csv>`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.

#### Step 4: Plot Results
The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
CSV file in `<dataset-path/<dataset>/search/result/<algo.csv>.
CSV file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.

The usage of this script is:
```bash
usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [-k COUNT] [-bs BATCH_SIZE] [--build] [--search]
[--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
options:
-h, --help show this help message and exit
Expand All @@ -189,6 +213,14 @@ options:
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
--output-filepath OUTPUT_FILEPATH
directory for PNG to be saved (default: os.getcwd())
--algorithms ALGORITHMS
plot only comma separated list of named algorithms (default: None)
-k COUNT, --count COUNT
the number of nearest neighbors to search for (default: 10)
-bs BATCH_SIZE, --batch-size BATCH_SIZE
number of query vectors to use in each query trial (default: 10000)
--build
--search
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
Expand Down

0 comments on commit 39dd3f4

Please sign in to comment.