From 291788697af6fbe0eabbd7136ee1554809f7072a Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 11 Aug 2023 17:35:28 -0700 Subject: [PATCH 1/5] try to run gbench executable --- bench/ann/algos.yaml | 16 +-- bench/ann/run.py | 173 ++++++++++++++----------- cpp/bench/ann/CMakeLists.txt | 19 ++- cpp/bench/ann/src/common/benchmark.cpp | 15 ++- 4 files changed, 126 insertions(+), 97 deletions(-) diff --git a/bench/ann/algos.yaml b/bench/ann/algos.yaml index 5f554fc46b..46d3c9e801 100644 --- a/bench/ann/algos.yaml +++ b/bench/ann/algos.yaml @@ -1,30 +1,18 @@ faiss_gpu_ivf_flat: - executable: FAISS_IVF_FLAT_ANN_BENCH - disabled: false -faiss_gpu_flat: - executable: FAISS_IVF_FLAT_ANN_BENCH disabled: false faiss_gpu_ivf_pq: - executable: FAISS_IVF_PQ_ANN_BENCH disabled: false faiss_gpu_ivf_sq: - executable: FAISS_IVF_PQ_ANN_BENCH disabled: false -faiss_gpu_bfknn: - executable: FAISS_BFKNN_ANN_BENCH +faiss_gpu_flat: disabled: false raft_ivf_flat: - executable: RAFT_IVF_FLAT_ANN_BENCH disabled: false raft_ivf_pq: - executable: RAFT_IVF_PQ_ANN_BENCH disabled: false raft_cagra: - executable: RAFT_CAGRA_ANN_BENCH disabled: false ggnn: - executable: GGNN_ANN_BENCH disabled: false hnswlib: - executable: HNSWLIB_ANN_BENCH - disabled: false \ No newline at end of file + disabled: false diff --git a/bench/ann/run.py b/bench/ann/run.py index ebaef1e004..90175f7433 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -25,54 +25,50 @@ def validate_algorithm(algos_conf, algo): return algo in algos_conf_keys and not algos_conf[algo]["disabled"] -def find_executable(algos_conf, algo): - executable = algos_conf[algo]["executable"] +def find_executable(): + executable = "ANN_BENCH" conda_path = os.path.join(os.getenv("CONDA_PREFIX"), "bin", "ann", executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path) + return conda_path elif os.path.exists(build_path): - return (executable, build_path) + return build_path else: raise FileNotFoundError(executable) -def run_build_and_search(conf_filename, conf_file, executables_to_run, +def run_build_and_search(conf_filename, conf_file, dataset_path, force, conf_filedir, build, search): - for executable, ann_executable_path in executables_to_run.keys(): - # Need to write temporary configuration - temp_conf_filename = f"temporary_executable_{conf_filename}" - temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) - with open(temp_conf_filepath, "w") as f: - temp_conf = dict() - temp_conf["dataset"] = conf_file["dataset"] - temp_conf["search_basic_param"] = conf_file["search_basic_param"] - temp_conf["index"] = executables_to_run[(executable, - ann_executable_path)]["index"] - json.dump(temp_conf, f) - - if build: - if force: - p = subprocess.Popen([ann_executable_path, "-b", "-f", - temp_conf_filepath]) - p.wait() - else: - p = subprocess.Popen([ann_executable_path, "-b", - temp_conf_filepath]) - p.wait() - - if search: - if force: - p = subprocess.Popen([ann_executable_path, "-s", "-f", - temp_conf_filepath]) - p.wait() - else: - p = subprocess.Popen([ann_executable_path, "-s", - temp_conf_filepath]) - p.wait() - - os.remove(temp_conf_filepath) + ann_executable_path = find_executable() + + # Need to write temporary configuration + temp_conf_filename = f"temporary_{conf_filename}" + temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) + with open(temp_conf_filepath, "w") as f: + json.dump(conf_file, f) + + data_prefix = 
"/".join(dataset_path.split("/")[:-1]) + if build: + cmd = [ann_executable_path, "--build", "--data_prefix="+data_prefix] + if force: + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + print(cmd) + p = subprocess.Popen(cmd) + p.wait() + + if search: + cmd = [ann_executable_path, "--search", "--benchmark_out_format=csv", + "--benchmark_out=" + os.path.join(dataset_path, "result.csv"), + "--data_prefix=" + data_prefix] + if force: + cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] + p = subprocess.Popen(cmd) + p.wait() + + os.remove(temp_conf_filepath) def main(): @@ -90,7 +86,6 @@ def main(): parser.add_argument( "--dataset", help="dataset whose configuration file will be used", - default="glove-100-inner" ) parser.add_argument( "--dataset-path", @@ -118,6 +113,12 @@ def main(): help="re-run algorithms even if their results \ already exist", action="store_true") + parser.add_argument("--batch-size", + help="batch size for querying", + default=1) + parser.add_argument("--k", + help="k neighbors", + default=10) args = parser.parse_args() @@ -133,75 +134,93 @@ def main(): # Read configuration file associated to dataset if args.configuration: conf_filepath = args.configuration + elif args.dataset: + conf_filepath = \ + os.path.join(scripts_path, "conf", f"{args.dataset}.json") else: - conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json") + raise ValueError("One of parameters `configuration` or \ + `dataset` need to be provided") conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) - dataset_name = conf_filename.replace(".json", "") - dataset_path = os.path.join(args.dataset_path, dataset_name) + dataset = conf_filename.replace(".json", "") + dataset_path = os.path.join(args.dataset_path, dataset) if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) + if not os.path.exists(dataset_path): + raise FileNotFoundError(dataset_path) with open(conf_filepath, "r") as f: conf_file = json.load(f) - # Replace base, query to dataset-path - conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") - conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") - # Ensure base and query files exist for dataset - if not os.path.exists(conf_file["dataset"]["base_file"]): - raise FileNotFoundError(conf_file["dataset"]["base_file"]) - if not os.path.exists(conf_file["dataset"]["query_file"]): - raise FileNotFoundError(conf_file["dataset"]["query_file"]) - - executables_to_run = dict() + # # Replace base, query, gr to dataset-path + # conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin") + # conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin") + # conf_file["dataset"]["groundtruth_neighbors_file"] = \ + # os.path.join(dataset_path, "groundtruth.neighbors.ibin") + # # Ensure base and query files exist for dataset + # if not os.path.exists(conf_file["dataset"]["base_file"]): + # raise FileNotFoundError(conf_file["dataset"]["base_file"]) + # if not os.path.exists(conf_file["dataset"]["query_file"]): + # raise FileNotFoundError(conf_file["dataset"]["query_file"]) + # if not os.path.exists(conf_file["dataset"]["groundtruth_neighbors_file"]): + # raise FileNotFoundError(conf_file["dataset"]["groundtruth_neighbors_file"]) + + # executables_to_run = dict() + indices_to_run = [] # At least one named index should exist in config file if args.indices: indices = set(args.indices.split(",")) # algo associated with index should 
still be present in algos.yaml # and enabled - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if index["name"] in indices and \ validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) # switch to named algorithms if indices parameter is not supplied elif args.algorithms: algorithms = set(args.algorithms.split(",")) # pick out algorithms from conf file that exist # and are enabled in algos.yaml - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if curr_algo in algorithms and \ validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) # default, try to run all available algorithms else: - for index in conf_file["index"]: + for pos, index in enumerate(conf_file["index"]): curr_algo = index["algo"] if validate_algorithm(algos_conf, curr_algo): - executable_path = find_executable(algos_conf, curr_algo) - if executable_path not in executables_to_run: - executables_to_run[executable_path] = {"index": []} - executables_to_run[executable_path]["index"].append(index) - - # Replace build, search to dataset path - for executable_path in executables_to_run: - for pos, index in enumerate(executables_to_run[executable_path]["index"]): - index["file"] = os.path.join(dataset_path, "index", index["name"]) - index["search_result_file"] = \ - os.path.join(dataset_path, "result", index["name"]) - executables_to_run[executable_path]["index"][pos] = index - - run_build_and_search(conf_filename, conf_file, executables_to_run, + # executable_path = find_executable(algos_conf, curr_algo) + # if executable_path not in executables_to_run: + # executables_to_run[executable_path] = {"index": []} + # executables_to_run[executable_path]["index"].append(index) + indices_to_run.append(pos) + + # filter available indices + if len(indices_to_run) == 0: + raise ValueError("No indices found to run") + conf_file["index"] = [conf_file["index"][i] for i in indices_to_run] + + # Replace index build to dataset path + for pos, index in enumerate(conf_file["index"]): + index["file"] = os.path.join(dataset_path, "index", index["name"]) + conf_file["index"][pos] = index + + print(conf_file) + + run_build_and_search(conf_filename, conf_file, dataset_path, args.force, conf_filedir, build, search) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 2ccdead89a..6df4df082f 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,9 +15,10 @@ # ################################################################################################## # * 
compiler function ----------------------------------------------------------------------------- -option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_SQ "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) @@ -183,18 +184,26 @@ endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) ConfigureAnnBench( - NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) ConfigureAnnBench( - NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) - ConfigureAnnBench(NAME FAISS_BFKNN PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_SQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) endif() if(RAFT_ANN_BENCH_USE_GGNN) diff --git a/cpp/bench/ann/src/common/benchmark.cpp b/cpp/bench/ann/src/common/benchmark.cpp index c73f2ed22a..cfffc36515 100644 --- a/cpp/bench/ann/src/common/benchmark.cpp +++ b/cpp/bench/ann/src/common/benchmark.cpp @@ -51,7 +51,20 @@ auto load_lib(const std::string& algo) -> void* if (found != libs.end()) { return found->second.handle; } auto lib_name = "lib" + algo + "_ann_bench.so"; - return libs.emplace(algo, lib_name).first->second.handle; + std::string lib_path = ""; + if (std::getenv("CONDA_PREFIX") != nullptr) { + auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/"; + if (std::filesystem::exists(conda_path + "ANN_BENCH")) { + lib_path = conda_path; + } + } + if (std::getenv("RAFT_HOME") != nullptr) { + auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/"; + if (std::filesystem::exists(build_path + "ANN_BENCH")) { + lib_path = build_path; + } + } + return libs.emplace(algo, lib_path + lib_name).first->second.handle; } auto get_fun_name(void* addr) -> std::string From f927f6927bbc1a1288617df5f00850e1d4c32e89 Mon Sep 17 00:00:00 2001 From: divyegala Date: Thu, 24 Aug 2023 14:18:45 -0700 Subject: [PATCH 2/5] compiling, index building successful, search failing --- bench/ann/algos.yaml | 3 +++ bench/ann/run.py | 22 ++++++++++++++-------- cpp/bench/ann/CMakeLists.txt | 19 +++++-------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/bench/ann/algos.yaml b/bench/ann/algos.yaml index 16a6c1a895..5f554fc46b 100644 --- a/bench/ann/algos.yaml +++ b/bench/ann/algos.yaml @@ -10,6 +10,9 @@ faiss_gpu_ivf_pq: faiss_gpu_ivf_sq: executable: FAISS_IVF_PQ_ANN_BENCH disabled: false +faiss_gpu_bfknn: + executable: FAISS_BFKNN_ANN_BENCH + 
disabled: false raft_ivf_flat: executable: RAFT_IVF_FLAT_ANN_BENCH disabled: false diff --git a/bench/ann/run.py b/bench/ann/run.py index dad1ad7a62..e64148abd8 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -75,19 +75,19 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, p.wait() if search: - # legacy_result_folder = "result/" + conf_file["dataset"]["name"] - # os.makedirs(legacy_result_folder, exist_ok=True) + legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') + os.makedirs(legacy_result_folder, exist_ok=True) cmd = [ann_executable_path, "--search", "--data_prefix="+dataset_path, "--benchmark_counters_tabular", - "--benchmark_out_format=json", + "--benchmark_out_format=csv", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, - "--benchmark_out_format=csv", - f"--benchmark_out={os.path.join(dataset_path, 'result.csv')}"] + f"--benchmark_out={os.path.join(dataset_path, conf_file['dataset']['name'], 'result', f'{executable}.csv')}"] if force: cmd = cmd + ["--overwrite"] + cmd = cmd + [temp_conf_filepath] print(cmd) p = subprocess.Popen(cmd) p.wait() @@ -171,11 +171,11 @@ def main(): conf_filename = conf_filepath.split("/")[-1] conf_filedir = "/".join(conf_filepath.split("/")[:-1]) dataset_name = conf_filename.replace(".json", "") - dataset_path = os.path.realpath(os.path.join(args.dataset_path, dataset_name)) + dataset_path = args.dataset_path if not os.path.exists(conf_filepath): raise FileNotFoundError(conf_filename) - if not os.path.exists(dataset_path): - raise FileNotFoundError(dataset_path) + if not os.path.exists(os.path.join(args.dataset_path, dataset_name)): + raise FileNotFoundError(os.path.join(args.dataset_path, dataset_name)) with open(conf_filepath, "r") as f: conf_file = json.load(f) @@ -219,6 +219,12 @@ def main(): executables_to_run[executable_path] = {"index": []} executables_to_run[executable_path]["index"].append(index) + # Replace index to dataset path + for executable_path in executables_to_run: + for pos, index in enumerate(executables_to_run[executable_path]["index"]): + index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"]) + executables_to_run[executable_path]["index"][pos] = index + print(executables_to_run) run_build_and_search(conf_file, conf_filename, conf_filedir, diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index ed067de064..119a5c0a73 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,10 +15,9 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_SQ "Include faiss' brute-force knn algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) @@ -193,26 +192,18 @@ endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) 
ConfigureAnnBench( - NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) ConfigureAnnBench( - NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) - ConfigureAnnBench( - NAME FAISS_GPU_IVF_SQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss - ) -endif() - -if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) - ConfigureAnnBench( - NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss - ) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) + ConfigureAnnBench(NAME FAISS_BFKNN PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) endif() if(RAFT_ANN_BENCH_USE_GGNN) From 82f195ec3c4112a5738a153d9d06724cee090426 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 25 Aug 2023 11:18:45 -0700 Subject: [PATCH 3/5] write build,search results --- bench/ann/run.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/bench/ann/run.py b/bench/ann/run.py index e64148abd8..8da3eadc3b 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -41,9 +41,9 @@ def find_executable(algos_conf, algo): executable) build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable) if os.path.exists(conda_path): - return (executable, conda_path) + return (executable, conda_path, algo) elif os.path.exists(build_path): - return (executable, build_path) + return (executable, build_path, algo) else: raise FileNotFoundError(executable) @@ -51,7 +51,7 @@ def find_executable(algos_conf, algo): def run_build_and_search(conf_file, conf_filename, conf_filedir, executables_to_run, dataset_path, force, build, search, k, batch_size): - for executable, ann_executable_path in executables_to_run.keys(): + for executable, ann_executable_path, algo in executables_to_run.keys(): # Need to write temporary configuration temp_conf_filename = f"temporary_{conf_filename}" temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename) @@ -60,13 +60,20 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, temp_conf["dataset"] = conf_file["dataset"] temp_conf["search_basic_param"] = conf_file["search_basic_param"] temp_conf["index"] = executables_to_run[(executable, - ann_executable_path)]["index"] + ann_executable_path, + algo)]["index"] json.dump(temp_conf, f) + legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') + os.makedirs(legacy_result_folder, exist_ok=True) if build: + build_folder = os.path.join(legacy_result_folder, "build") + os.makedirs(build_folder, exist_ok=True) cmd = [ann_executable_path, "--build", - "--data_prefix="+dataset_path] + "--data_prefix="+dataset_path, + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] @@ -75,16 +82,16 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir, p.wait() if search: - legacy_result_folder = os.path.join(dataset_path, conf_file['dataset']['name'], 'result') - os.makedirs(legacy_result_folder, exist_ok=True) + search_folder = os.path.join(legacy_result_folder, "search") + os.makedirs(search_folder, exist_ok=True) cmd = [ann_executable_path, "--search", "--data_prefix="+dataset_path, "--benchmark_counters_tabular", 
- "--benchmark_out_format=csv", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, - f"--benchmark_out={os.path.join(dataset_path, conf_file['dataset']['name'], 'result', f'{executable}.csv')}"] + "--benchmark_out_format=csv", + f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"] if force: cmd = cmd + ["--overwrite"] cmd = cmd + [temp_conf_filepath] From 74c9a1bc4704f25dfcc0a2c8901b813c75da7883 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 25 Aug 2023 18:44:04 -0700 Subject: [PATCH 4/5] remove data_export, use gbench csvs to plot --- bench/ann/data_export.py | 80 ------------------------------ bench/ann/plot.py | 44 ++++++++++------ bench/ann/run.py | 3 +- docs/source/raft_ann_benchmarks.md | 68 +++++++++---------------- 4 files changed, 53 insertions(+), 142 deletions(-) delete mode 100644 bench/ann/data_export.py diff --git a/bench/ann/data_export.py b/bench/ann/data_export.py deleted file mode 100644 index 87ca330ed9..0000000000 --- a/bench/ann/data_export.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import subprocess -import json - -from pathlib import Path - -def parse_filepaths(fs): - for p in fs: - if p.endswith(".json") and os.path.exists(p): - yield p - else: - for f in Path(p).rglob('*.json'): - yield f.as_posix() - -def export_results(output_filepath, recompute, groundtruth_filepath, - result_filepath): - print(f"Writing output file to: {output_filepath}") - - parsed_filepaths = parse_filepaths(result_filepaths) - - with open(output_filepath, 'w') as out: - out.write("Algo,Recall,QPS\n") - - for fp in parsed_filepaths: - with open(fp, 'r') as f: - data = json.load(f) - for benchmark_case in data["benchmarks"]: - algo = benchmark_case["name"] - recall = benchmark_case["Recall"] - qps = benchmark_case["items_per_second"] - out.write(f"{algo},{recall},{qps}\n") - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--output", help="Path to the CSV output file", - required=True) - parser.add_argument("--recompute", action="store_true", - help="Recompute metrics") - parser.add_argument("--dataset", - help="Name of the dataset to export results for", - default="glove-100-inner") - parser.add_argument( - "--dataset-path", - help="path to dataset folder", - default=os.path.join(os.getenv("RAFT_HOME"), - "bench", "ann", "data") - ) - - args, result_filepaths = parser.parse_known_args() - - # if nothing is provided - if len(result_filepaths) == 0: - raise ValueError("No filepaths to results were provided") - - groundtruth_filepath = os.path.join(args.dataset_path, args.dataset, - "groundtruth.neighbors.ibin") - export_results(args.output, args.recompute, groundtruth_filepath, - result_filepath) - - -if __name__ == "__main__": - main() diff --git a/bench/ann/plot.py b/bench/ann/plot.py index 0020e398a9..33a1872fe0 100644 --- a/bench/ann/plot.py +++ b/bench/ann/plot.py 
@@ -192,25 +192,38 @@ def inv_fun(x): plt.close() -def load_all_results(result_filepath): +def load_all_results(dataset_path): results = dict() - with open(result_filepath, 'r') as f: - for line in f.readlines()[1:]: - split_lines = line.split(',') - algo_name = split_lines[0].split('.')[0] - if algo_name not in results: - results[algo_name] = [] - results[algo_name].append([algo_name, float(split_lines[1]), - float(split_lines[2])]) + results_path = os.path.join(dataset_path, "result", "search") + for result_filepath in os.listdir(results_path): + with open(os.path.join(results_path, result_filepath), 'r') as f: + lines = f.readlines() + idx = 0 + for pos, line in enumerate(lines): + if "QPS" in line: + idx = pos + break + + for line in lines[idx+1:]: + split_lines = line.split(',') + algo_name = split_lines[0].split('.')[0].strip("\"") + if algo_name not in results: + results[algo_name] = [] + results[algo_name].append([algo_name, float(split_lines[12]), + float(split_lines[10])]) return results def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--result-csv", help="Path to CSV Results", required=True) - parser.add_argument("--output", help="Path to the PNG output file", - default=f"{os.getcwd()}/out.png") + parser.add_argument("--dataset", help="dataset to download", + default="glove-100-inner") + parser.add_argument("--dataset-path", help="path to dataset folder", + default=os.path.join(os.getenv("RAFT_HOME"), + "bench", "ann", "data")) + parser.add_argument("--output-filename", + default="plot.png") parser.add_argument( "--x-scale", help="Scale to use when drawing the X-axis. \ @@ -228,12 +241,13 @@ def main(): ) args = parser.parse_args() - print(f"writing output to {args.output}") + output_filepath = os.path.join(args.dataset_path, args.dataset, args.output_filename) + print(f"writing output to {output_filepath}") - results = load_all_results(args.result_csv) + results = load_all_results(os.path.join(args.dataset_path, args.dataset)) linestyles = create_linestyles(sorted(results.keys())) - create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles) + create_plot(results, args.raw, args.x_scale, args.y_scale, output_filepath, linestyles) if __name__ == "__main__": diff --git a/bench/ann/run.py b/bench/ann/run.py index 8da3eadc3b..5c927d5066 100644 --- a/bench/ann/run.py +++ b/bench/ann/run.py @@ -124,6 +124,7 @@ def main(): parser.add_argument( "--dataset", help="dataset whose configuration file will be used", + default="glove-100-inner" ) parser.add_argument( "--dataset-path", @@ -232,8 +233,6 @@ def main(): index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"]) executables_to_run[executable_path]["index"][pos] = index - print(executables_to_run) - run_build_and_search(conf_file, conf_filename, conf_filedir, executables_to_run, dataset_path, args.force, build, search, diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 67e04ff518..78ff2d96a1 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -21,11 +21,10 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks ### Usage -There are 4 general steps to running the benchmarks and vizualizing the results: +There are 3 general steps to running the benchmarks and vizualizing the results: 1. Prepare Dataset 2. Build Index and Search Index -3. Evaluate Results -4. Plot Results +3. 
Plot Results

We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
@@ -47,11 +46,8 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize
# (2) build and search index
python bench/ann/run.py --dataset deep-image-96-inner

-# (3) evaluate results
-python bench/ann/data_export.py --output out.csv --dataset deep-image-96-inner
-
-# (4) plot results
-python bench/ann/plot.py --result-csv out.csv
+# (3) plot results
+python bench/ann/plot.py --dataset deep-image-96-inner
```

Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `bench/ann/conf`.
@@ -86,11 +82,8 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_
# (2) build and search index
python bench/ann/run.py --dataset deep-1B

-# (3) evaluate results
-python bench/ann/data_export.py --output out.csv --dataset deep-1B
-
-# (4) plot results
-python bench/ann/plot.py --result-csv out.csv
+# (3) plot results
+python bench/ann/plot.py --dataset deep-1B
```

The usage of `bench/ann/split-groundtruth.py` is:
@@ -119,6 +112,7 @@ options:
                        path to download dataset (default: ${RAFT_HOME}/bench/ann/data)
  --normalize           normalize cosine distance to inner product (default: False)
```
+
When option `normalize` is provided to the script, any dataset that has cosine distances
will be normalized to inner product. So, for example, the dataset `glove-100-angular`
will be written at location `${RAFT_HOME}/bench/ann/data/glove-100-inner/`.
@@ -140,13 +134,15 @@ available in `raft/cpp/build/`.

The usage of the script `bench/ann/run.py` is:
```bash
-usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
-
-options:
-usage: run.py [-h] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [-f]
+usage: run.py [-h] [-k COUNT] [-bs BATCH_SIZE] [--configuration CONFIGURATION] [--dataset DATASET] [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES]
+              [-f]

options:
  -h, --help            show this help message and exit
+  -k COUNT, --count COUNT
+                        the number of nearest neighbors to search for (default: 10)
+  -bs BATCH_SIZE, --batch-size BATCH_SIZE
+                        number of query vectors to use in each query trial (default: 10000)
  --configuration CONFIGURATION
                        path to configuration file for a dataset (default: None)
  --dataset DATASET     dataset whose configuration file will be used (default: glove-100-inner)
@@ -157,14 +153,15 @@ options:
  --algorithms ALGORITHMS
                        run only comma separated list of named algorithms (default: None)
  --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
-  -k, --count           number of nearest neighbors to return
-  --batch-size          number of query vectors to pass into search
  -f, --force           re-run algorithms even if their results already exist (default: False)
```
+
`configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.
The configuration file should be named `<dataset>.json`.
It is optional if the name of the dataset is provided with the `dataset` argument, in which case
-a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`
+a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
+For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/build/`
+and an index search statistics CSV file in `<dataset-path>/<dataset>/search/`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -177,45 +174,26 @@ it is assumed both are `True`.
`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index
is available in `algos.yaml` and not disabled, as well as having an associated executable.

-#### Step 3: Evaluating Results
-The script `bench/ann/data_export.py` will evaluate results for a dataset whose index has been built
-and searched with at least one algorithm. For every result file that is available to the script, the output
-will be combined and written to a CSV file.
+#### Step 3: Plot Results
+The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
+CSV file in `<dataset-path>/<dataset>/search/`.

The usage of this script is:
```bash
-usage: data_export.py [-h] --output OUTPUT [--recompute] [--dataset DATASET] [--dataset-path DATASET_PATH]
+usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filename OUTPUT_FILENAME] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]

options:
  -h, --help            show this help message and exit
-  --output OUTPUT       Path to the CSV output file (default: None)
-  --recompute           Recompute metrics (default: False)
-  --dataset DATASET     Name of the dataset to export results for (default: glove-100-inner)
+  --dataset DATASET     dataset to download (default: glove-100-inner)
  --dataset-path DATASET_PATH
                        path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
-```
-
-#### Step 4: Plot Results
-The script `bench/ann/plot.py` will plot all results evaluated to a CSV file for a given dataset.
-
-The usage of this script is:
-```bash
-usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
-
-options:
-  -h, --help            show this help message and exit
-  --result-csv RESULT_CSV
-                        Path to CSV Results (default: None)
-  --output OUTPUT       Path to the PNG output file (default: ${RAFT_HOME}/out.png)
+  --output-filename OUTPUT_FILENAME
  --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
  --y-scale {linear,log,symlog,logit}
                        Scale to use when drawing the Y-axis (default: linear)
  --raw                 Show raw results (not just Pareto frontier) in faded colours (default: False)
```

-All algorithms present in the CSV file supplied to this script with parameter `result_csv`
-will appear in the plot.
-
The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall.
![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png)

From 902f9f48b34e397c6846d05b0f52016932e4537f Mon Sep 17 00:00:00 2001
From: divyegala
Date: Fri, 25 Aug 2023 18:51:34 -0700
Subject: [PATCH 5/5] fix typo in docs path for results

---
 docs/source/raft_ann_benchmarks.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 78ff2d96a1..757e9a59b5 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -160,8 +160,8 @@ options:
The configuration file should be named `<dataset>.json`.
It is optional if the name of the dataset is provided with the `dataset` argument, in which case
a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
-For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/build/`
-and an index search statistics CSV file in `<dataset-path>/<dataset>/search/`.
+For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path>/<dataset>/result/build/`
+and an index search statistics CSV file in `<dataset-path>/<dataset>/result/search/`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
@@ -176,7 +176,7 @@ is available in `algos.yaml` and not disabled, as well as having an associated e

#### Step 3: Plot Results
The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
-CSV file in `<dataset-path>/<dataset>/search/`.
+CSV file in `<dataset-path>/<dataset>/result/search/`.
The usage of this script is:
```bash
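
---

Taken together, the series replaces the per-algorithm `*_ANN_BENCH` executables with a single gbench-driven `ANN_BENCH` binary that `bench/ann/run.py` locates and invokes with command-line flags, writing per-algorithm CSV files that `plot.py` later reads. The following is a minimal standalone sketch of that invocation logic, using only the flags that appear in the diffs above (`--build`/`--search`, `--data_prefix`, `--override_kv`, `--benchmark_out_format`, `--benchmark_out`, `--overwrite`); the helper names `find_ann_bench` and `run_search` are illustrative, not part of the patch:

```python
import os
import subprocess


def find_ann_bench():
    # Mirrors find_executable() in bench/ann/run.py: prefer the conda
    # install location, then fall back to the source build tree.
    for base in (os.path.join(os.getenv("CONDA_PREFIX", ""), "bin", "ann"),
                 os.path.join(os.getenv("RAFT_HOME", ""), "cpp", "build")):
        path = os.path.join(base, "ANN_BENCH")
        if os.path.exists(path):
            return path
    raise FileNotFoundError("ANN_BENCH")


def run_search(conf_path, data_prefix, out_csv, k=10, batch_size=1, force=False):
    # Assemble a search invocation the way run.py does; k and n_queries are
    # injected per run through gbench's --override_kv mechanism.
    cmd = [find_ann_bench(), "--search",
           "--data_prefix=" + data_prefix,
           "--benchmark_counters_tabular",
           "--override_kv=k:%s" % k,
           "--override_kv=n_queries:%s" % batch_size,
           "--benchmark_out_format=csv",
           "--benchmark_out=" + out_csv]
    if force:
        cmd.append("--overwrite")
    # Last argument is the (temporary) JSON configuration written by run.py.
    cmd.append(conf_path)
    subprocess.run(cmd, check=True)
```

The CSV named by `--benchmark_out` is what `load_all_results` in `plot.py` then scans, skipping the gbench preamble until the header row containing `QPS` before reading the recall and throughput columns.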