From b571ba9009f69e0b325b353cec77f4ece721e3c1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Feb 2023 12:11:49 -0500 Subject: [PATCH 01/39] Initial commit of cuann_benchmarks code. Squashed branch to remove IP. --- build.sh | 29 +- cpp/CMakeLists.txt | 8 + cpp/cmake/thirdparty/get_ggnn.cmake | 37 + cpp/cmake/thirdparty/get_hnswlib.cmake | 37 + cpp/cmake/thirdparty/get_nlohmann_json.cmake | 39 + cpp/cuann_bench/CMakeLists.txt | 121 + cpp/cuann_bench/README.md | 325 +++ cpp/cuann_bench/conf/bigann-100M.json | 342 +++ cpp/cuann_bench/conf/bigann-1B.json | 139 ++ cpp/cuann_bench/conf/deep-100M.fp16.json | 50 + cpp/cuann_bench/conf/deep-100M.json | 521 +++++ cpp/cuann_bench/conf/deep-1B.json | 304 +++ cpp/cuann_bench/conf/glove-100-inner.json | 1506 ++++++++++++ cpp/cuann_bench/conf/sift-128-euclidean.json | 2023 +++++++++++++++++ cpp/cuann_bench/scripts/eval.pl | 430 ++++ cpp/cuann_bench/scripts/fbin_to_f16bin.py | 46 + cpp/cuann_bench/scripts/hdf5_to_fbin.py | 84 + cpp/cuann_bench/scripts/split_groundtruth.pl | 45 + cpp/cuann_bench/src/ann.h | 89 + cpp/cuann_bench/src/benchmark.cu | 555 +++++ cpp/cuann_bench/src/conf.cpp | 136 ++ cpp/cuann_bench/src/conf.h | 76 + cpp/cuann_bench/src/cudart_util.h | 64 + cpp/cuann_bench/src/dataset.h | 385 ++++ cpp/cuann_bench/src/factory.cuh | 379 +++ cpp/cuann_bench/src/faiss_wrapper.h | 315 +++ cpp/cuann_bench/src/ggnn_wrapper.cuh | 309 +++ cpp/cuann_bench/src/hnswlib_wrapper.h | 333 +++ cpp/cuann_bench/src/multigpu.cuh | 515 +++++ cpp/cuann_bench/src/raft_cuann_utils.h | 49 + cpp/cuann_bench/src/raft_ivf_flat.cu | 21 + cpp/cuann_bench/src/raft_ivf_flat_wrapper.h | 146 ++ cpp/cuann_bench/src/raft_ivf_pq.cu | 21 + cpp/cuann_bench/src/raft_ivf_pq_wrapper.h | 225 ++ cpp/cuann_bench/src/raft_wrapper.h | 155 ++ cpp/cuann_bench/src/util.cpp | 68 + cpp/cuann_bench/src/util.h | 82 + .../third_party/patches/ggnn.patch | 206 ++ .../third_party/patches/json.patch | 38 + dependencies.yaml | 13 + 40 files changed, 10255 
insertions(+), 11 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_ggnn.cmake create mode 100644 cpp/cmake/thirdparty/get_hnswlib.cmake create mode 100644 cpp/cmake/thirdparty/get_nlohmann_json.cmake create mode 100644 cpp/cuann_bench/CMakeLists.txt create mode 100644 cpp/cuann_bench/README.md create mode 100644 cpp/cuann_bench/conf/bigann-100M.json create mode 100644 cpp/cuann_bench/conf/bigann-1B.json create mode 100644 cpp/cuann_bench/conf/deep-100M.fp16.json create mode 100644 cpp/cuann_bench/conf/deep-100M.json create mode 100644 cpp/cuann_bench/conf/deep-1B.json create mode 100644 cpp/cuann_bench/conf/glove-100-inner.json create mode 100644 cpp/cuann_bench/conf/sift-128-euclidean.json create mode 100755 cpp/cuann_bench/scripts/eval.pl create mode 100755 cpp/cuann_bench/scripts/fbin_to_f16bin.py create mode 100755 cpp/cuann_bench/scripts/hdf5_to_fbin.py create mode 100755 cpp/cuann_bench/scripts/split_groundtruth.pl create mode 100644 cpp/cuann_bench/src/ann.h create mode 100644 cpp/cuann_bench/src/benchmark.cu create mode 100644 cpp/cuann_bench/src/conf.cpp create mode 100644 cpp/cuann_bench/src/conf.h create mode 100644 cpp/cuann_bench/src/cudart_util.h create mode 100644 cpp/cuann_bench/src/dataset.h create mode 100644 cpp/cuann_bench/src/factory.cuh create mode 100644 cpp/cuann_bench/src/faiss_wrapper.h create mode 100644 cpp/cuann_bench/src/ggnn_wrapper.cuh create mode 100644 cpp/cuann_bench/src/hnswlib_wrapper.h create mode 100644 cpp/cuann_bench/src/multigpu.cuh create mode 100644 cpp/cuann_bench/src/raft_cuann_utils.h create mode 100644 cpp/cuann_bench/src/raft_ivf_flat.cu create mode 100644 cpp/cuann_bench/src/raft_ivf_flat_wrapper.h create mode 100644 cpp/cuann_bench/src/raft_ivf_pq.cu create mode 100644 cpp/cuann_bench/src/raft_ivf_pq_wrapper.h create mode 100644 cpp/cuann_bench/src/raft_wrapper.h create mode 100644 cpp/cuann_bench/src/util.cpp create mode 100644 cpp/cuann_bench/src/util.h create mode 100644 
cpp/cuann_bench/third_party/patches/ggnn.patch create mode 100644 cpp/cuann_bench/third_party/patches/json.patch diff --git a/build.sh b/build.sh index 93f11d11a1..8731ec1020 100755 --- a/build.sh +++ b/build.sh @@ -2,23 +2,23 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# raft build script +# raft build scripts -# This script is used to build the component(s) in this repo from +# This scripts is used to build the component(s) in this repo from # source, and can be called with various options to customize the # build as needed (see the help output for details) -# Abort script on first error +# Abort scripts on first error set -e NUMARGS=$# ARGS=$* # NOTE: ensure all dir changes are relative to the location of this -# script, and that this script resides in the repo dir! +# scripts, and that this scripts resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean --uninstall -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" +VALIDARGS="clean libraft pylibraft raft-dask docs tests bench cuann_bench clean --uninstall -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] where is: clean - remove all existing build artifacts and configuration (start over) @@ -29,6 +29,7 @@ HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool= is: -v - verbose build mode @@ -62,13 +63,14 @@ RAFT_DASK_BUILD_DIR=${REPODIR}/python/raft-dask/_skbuild PYLIBRAFT_BUILD_DIR=${REPODIR}/python/pylibraft/_skbuild BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PYLIBRAFT_BUILD_DIR} ${RAFT_DASK_BUILD_DIR}" -# Set defaults for vars modified by flags to this script +# Set defaults for vars modified by flags to this scripts CMAKE_LOG_LEVEL="" VERBOSE_FLAG="" BUILD_ALL_GPU_ARCH=0 BUILD_TESTS=OFF BUILD_TYPE=Release BUILD_BENCH=OFF +BUILD_CUANN_BENCH=OFF BUILD_STATIC_FAISS=OFF COMPILE_LIBRARIES=OFF COMPILE_NN_LIBRARY=OFF @@ -336,6 +338,14 @@ if hasArg bench || (( ${NUMARGS} == 0 )); then fi +if hasArg cuann_bench || (( ${NUMARGS} == 0 )); then + BUILD_CUANN_BENCH=ON + CMAKE_TARGET="${CMAKE_TARGET};CUANN_BENCH" + ENABLE_NN_DEPENDENCIES=ON + COMPILE_NN_LIBRARY=ON + COMPILE_DIST_LIBRARY=ON +fi + if hasArg --buildfaiss; then BUILD_STATIC_FAISS=ON fi @@ -349,8 +359,6 @@ if hasArg clean; then CLEAN=1 fi - - if [[ ${CMAKE_TARGET} == "" ]]; then CMAKE_TARGET="all" fi @@ -384,7 +392,7 @@ fi ################################################################################ # Configure for building all C++ targets -if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench; then +if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench || hasArg cuann_bench; then if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE" echo "Building for the architecture of the GPU in the system..." 
@@ -405,6 +413,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_BENCH=${BUILD_BENCH} \ + -DBUILD_CUANN_BENCH=${BUILD_CUANN_BENCH} \ -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \ -DRAFT_COMPILE_NN_LIBRARY=${COMPILE_NN_LIBRARY} \ -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ @@ -429,7 +438,6 @@ if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi - cd ${REPODIR}/python/raft-dask python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then @@ -443,7 +451,6 @@ if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi - cd ${REPODIR}/python/pylibraft python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7e5b10b227..6c02cb2523 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -47,6 +47,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_SHARED_LIBS "Build raft shared libraries" ON) option(BUILD_TESTS "Build raft unit-tests" ON) option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_CUANN_BENCH "Build raft ann benchmarks" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF @@ -739,3 +740,10 @@ endif() if(BUILD_BENCH) 
include(bench/CMakeLists.txt) endif() + +# ################################################################################################## +# * build cuann benchmark executable ----------------------------------------------- + +if(BUILD_CUANN_BENCH) + include(cuann_bench/CMakeLists.txt) +endif() diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 0000000000..a448ae0078 --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,37 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_ggnn) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + IF ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src) + execute_process ( + COMMAND git clone "https://github.com/${PKG_FORK}/ggnn" --branch ${PKG_PINNED_TAG} ggnn-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + endif ( ) + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_ggnn(VERSION 0.5 + FORK cgtuebingen + PINNED_TAG release_0.5 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake new file mode 100644 index 0000000000..d4ebaf0729 --- /dev/null +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -0,0 +1,37 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_hnswlib) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + + IF ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) + execute_process ( + COMMAND git clone "https://github.com/${PKG_FORK}/hnswlib" --branch ${PKG_PINNED_TAG} hnswlib-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + endif () +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_hnswlib(VERSION 0.6.2 + FORK nmslib + PINNED_TAG v0.6.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake new file mode 100644 index 0000000000..109bdc03d4 --- /dev/null +++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake @@ -0,0 +1,39 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_nlohmann_json) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(nlohmann_json ${PKG_VERSION} + GLOBAL_TARGETS nlohmann_json::nlohmann_json + BUILD_EXPORT_SET cuann_bench-exports + INSTALL_EXPORT_SET cuann_bench-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}) + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_nlohmann_json(VERSION 3.11.2 + FORK nlohmann + PINNED_TAG v3.11.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cuann_bench/CMakeLists.txt b/cpp/cuann_bench/CMakeLists.txt new file mode 100644 index 0000000000..a51417b44b --- /dev/null +++ b/cpp/cuann_bench/CMakeLists.txt @@ -0,0 +1,121 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +option(RAFT_CUANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) + +set(RAFT_CUANN_BENCH_USE_FAISS OFF) +if(RAFT_CUANN_BENCH_USE_FAISS_BFKNN + OR RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ + OR RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT +) + set(RAFT_CUANN_BENCH_USE_FAISS ON) +endif() + +if(RAFT_CUANN_BENCH_USE_HNSWLIB) + include(cmake/thirdparty/get_hnswlib.cmake) +endif() + +set(RAFT_CUANN_BENCH_USE_RAFT OFF) +if(RAFT_CUANN_BENCH_USE_RAFT_BFKNN + OR RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ + OR RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +) + set(RAFT_CUANN_BENCH_USE_RAFT ON) +endif() + +option(RAFT_CUANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) + +include(cmake/thirdparty/get_nlohmann_json.cmake) + +if(RAFT_CUANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_ggnn.cmake) +endif() + +if(RAFT_CUANN_BENCH_USE_FAISS) + include(cmake/thirdparty/get_faiss.cmake) +endif() + +add_executable( + CUANN_BENCH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_flat.cu + cuann_bench/src/raft_ivf_pq.cu cuann_bench/src/util.cpp +) + 
+target_link_libraries( + CUANN_BENCH + PRIVATE raft::raft + nlohmann_json::nlohmann_json + raft_internal + $<$:raft::distance> + $<$:raft::nn> + $<$:faiss::faiss> + $<$:NCCL::NCCL> + # $<$:hnswlib> + Threads::Threads + $ + $ +) + +set_target_properties( + CUANN_BENCH + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + +target_compile_options( + CUANN_BENCH PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" +) + +target_compile_definitions( + CUANN_BENCH + PUBLIC RAFT_CUANN_BENCH_USE_FAISS_BFKNN=${RAFT_CUANN_BENCH_USE_FAISS_BFKNN} + RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT=${RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT} + RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ=${RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ} + RAFT_CUANN_BENCH_USE_RAFT_BFKNN=${RAFT_CUANN_BENCH_USE_RAFT_BFKNN} + RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT=${RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT} + RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ=${RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ} + RAFT_CUANN_BENCH_USE_HNSWLIB=${RAFT_CUANN_BENCH_USE_HNSWLIB} + RAFT_CUANN_BENCH_USE_GGNN=${RAFT_CUANN_BENCH_USE_GGNN} +) + +target_include_directories( + CUANN_BENCH + PUBLIC "$" + PRIVATE + "$:${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include>>" + "$:${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib>>" +) + +install( + TARGETS CUANN_BENCH + COMPONENT testing + DESTINATION bin/CUANN_BENCH + EXCLUDE_FROM_ALL +) diff --git a/cpp/cuann_bench/README.md b/cpp/cuann_bench/README.md new file mode 100644 index 0000000000..310f31b95b --- /dev/null +++ b/cpp/cuann_bench/README.md @@ -0,0 +1,325 @@ +# cuANN - CUDA Approximate Nearest Neighbor (ANN) Search + +This project provides a benchmark program for various ANN search implementations. It's especially suitable for GPU implementations. 
+ +## Developer Guide + +Please read [CONTRIBUTING.md](CONTRIBUTING.md) before writing code for this project. + +## Benchmark + +### Building +Prerequisites for compiling the `benchmark` program: +* CUDA >= 11.3 +* GCC >= 8.2 if RAFT is used (preferably GCC 9.5+) +* NCCL >= 2.10 if multi-GPU support is enabled +* FAISS (https://github.com/facebookresearch/faiss) if it's enabled +* cmake in the search path of executable files, for automatically installing FAISS and glog (Google Logging Library, required by GGNN) +* miscellaneous libraries that can be installed using Makefile + +#### installing misc. libraries +Most of the libraries are optional, and they can be enabled by the CUANN_USE_XYZ flags in [benchmark/Makefile](benchmark/Makefile). They will be downloaded automatically. + + +#### installing NCCL +If `CUANN_USE_MULTI_GPU = 1` in `benchmark/Makefile`, NCCL is required. + +It's most convenient to install NCCL under `${cuann_path}/third_party/nccl/`, like using "O/S agnostic local installer" downloaded from https://developer.nvidia.com/nccl/nccl-download. Otherwise, may need to modify `CPPFLAGS` and `LDFLAGS` in `benchmark/Makefile` to add include and library paths. + + +#### installing FAISS library +FAISS can be installed in many ways: +* If `CUANN_USE_RAPIDS_CONTAINER = 1` in `benchmark/Makefile`, RAPIDS Docker container already has FAISS installed +* If `CUANN_USE_RAPIDS_CONTAINER = 0` in `benchmark/Makefile`, FAISS will be installed automatically with the Makefile. However, it requires that a BLAS implementation be available, by setting either environmental paths or Makefile flags. +* Sometimes, FAISS has already been installed in the system and we want to use that. Besides modifying `FAISS_PATH` in `benchmark/Makefile`, we also need to prevent Makefile from installing FAISS again. For that, change the line `faiss: faiss/lib/libfaiss.so` in `third_party/Makefile` to `faiss:`. + +For manual installation: need to install FAISS from source. 
See [Building from source](https://github.com/facebookresearch/faiss/blob/master/INSTALL.md#building-from-source) for detailed steps. + +It's most convenient to install FAISS under `${cuann_path}/third_party/faiss/`. Otherwise, may need to modify `FAISS_PATH` in `benchmark/Makefile`. + +An example of cmake build commands: +``` +mkdir build && cd build +cmake -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_PYTHON=OFF -DBUILD_TESTING=OFF \ + -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES="70;75;80;86" \ + -DCMAKE_INSTALL_PREFIX=${cuann_path}/third_party/faiss .. +``` + + + + +#### compiling benchmark +First, modify CUANN_USE_XXX flags at the top of benchmark/Makefile to enable desirable implementations. By default, none is enabled. + +Then, just run `cd benchmark && make -j`. + +By default, the `benchmark` program accepts dataset of `float` type. To use other type, change the line `using data_t = float;` in `benchmark/src/benchmark.cu` to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. + + +### Usage +There are 4 steps to run the benchmark: +1. prepare dataset +2. build index +3. search using built index +4. evaluate result + +#### TL;DR +A complete example: +``` +# (1) prepare a dataset +pip3 install numpy h5py # if they have not been installed already +cd benchmark +mkdir data && cd data +wget http://ann-benchmarks.com/glove-100-angular.hdf5 +# option -n is used here to normalize vectors so cosine distance is converted +# to inner product; don't use -n for l2 distance +../../script/hdf5_to_fbin.py -n glove-100-angular.hdf5 +mkdir glove-100-inner +mv glove-100-angular.base.fbin glove-100-inner/base.fbin +mv glove-100-angular.query.fbin glove-100-inner/query.fbin +mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin +mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin +cd .. 
+ +# (2) build index +./benchmark -b -i faiss_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (3) search +./benchmark -s -i faiss_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (4) evaluate result +../script/eval.pl \ + -o result.csv \ + data/glove-100-inner/groundtruth.neighbors.ibin \ + result/glove-100-inner/faiss_ivf_flat + +# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool +``` + + +#### step 1: preparing dataset +A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. + +The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. +These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. + +Some implementation, like Cagra, can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. + +Commonly used datasets can be downloaded from two websites: +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). + + However, these datasets are in HDF5 format. 
Use `script/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + ``` + pip3 install numpy h5py + ``` + The usage of this script is: + ``` + $ script/hdf5_to_fbin.py + usage: script/hdf5_to_fbin.py [-n] .hdf5 + -n: normalize base/query set + outputs: .base.fbin + .query.fbin + .groundtruth.neighbors.ibin + .groundtruth.distances.fbin + ``` + So for an input `.hdf5` file, four output binary files will be produced. See previous section for an example of prepossessing GloVe dataset. + + Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. + + +2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: + ``` + $ script/split_groundtruth.pl + usage: script/split_groundtruth.pl input output_prefix + ``` + Take Deep-1B dataset as an example: + ``` + cd benchmark + mkdir -p data/deep-1B && cd data/deep-1B + # download manually "Ground Truth" file of "Yandex DEEP" + # suppose the file name is deep_new_groundtruth.public.10K.bin + ../../../script/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + ``` + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. 
To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. + + +#### step 2: building index +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `benchmark -b` to build an index and save it to disk. + + +To run `benchmark`, a JSON configuration file is required. Refer to [`benchmark/conf/glove-100-inner.json`](conf/glove-100-inner.json) as an example. Configuration file has 3 sections: +* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since `benchmark` program is for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. + - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. +* `search_basic_param` section specifies basic parameters for searching: + - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. + - `run_count` means how many times we run the searching. A single run of searching will search neighbors for all vectors in `test` set. The total time used for a run is recorded, and the final searching time is the smallest one among these runs. +* `index` section specifies an array of configurations for index building and searching: + - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. + - `file` is the file name of index. Building will save built index to this file, while searching will load this file. + - `search_result_file` is the file name prefix of searching results. 
Searching will save results to these files, and plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in `search_params` array. That is, for one searching parameter, there will be some corresponding search result files. + - if `multigpu` is specified, multiple GPUs will be used for index build and search. + - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. + + + +The usage of `benchmark` can be found by running `benchmark -h`: +``` +$ ./benchmark -h +usage: ./benchmark -b|s [-f] [-i index_names] conf.json + -b: build mode, will build index + -s: search mode, will search using built index + one and only one of -b and -s should be specified + -f: force overwriting existing output files + -i: by default will build/search all the indices found in conf.json + '-i' can be used to select a subset of indices + 'index_names' is a list of comma-separated index names + '*' is allowed as the last character of a name to select all matched indices + for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" +``` +* `-b`: build index. +* `-s`: do the searching with built index. +* `-f`: before doing the real task, `benchmark` checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. 
+* `-i`: by default, `benchmark -b` will build all indices found in the configuration file, and `benchmark -s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. + +It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: +``` + "index" : [ + { + "name" : "hnsw1", + ... + }, + { + "name" : "hnsw2", + ... + }, + { + "name" : "faiss", + ... + } + ] +``` +Then, +``` +# build all indices: hnsw1, hnsw2 and faiss +./benchmark -b a.json + +# build only hnsw1 +./benchmark -b -i hnsw1 a.json + +# build hnsw1 and hnsw2 +./benchmark -b -i hnsw1,hnsw2 a.json + +# build hnsw1 and hnsw2 +./benchmark -b -i 'hnsw*' a.json + +# build hnsw1, hnsw2 and faiss +./benchmark -b -i 'hnsw*,faiss' a.json +``` +In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. + + +#### step 3: searching +Use `benchmark -s`. Other options are the same as in step 2. + + +#### step 4: evaluating results +Use `script/eval.pl` to evaluate benchmark results. The usage is: +``` +$ script/eval.pl +usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -o: also write result to a csv file +``` +Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
+An example: +``` +script/eval.pl groundtruth.neighbors.ibin \ + result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ + result/glove-100-angular/10/faiss/ +``` +The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively. + +This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. + +It saves the recall value in the result txt file, so it avoids recomputing recall if the same command is run again. To force recomputation of recall, option `-f` can be used. If option `-o` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. + + + +## How to add a new ANN algorithm +Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `include/cuann/ann.h`) and implements all the pure virtual functions. + +In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example; its definition is: +``` +template +class HnswLib : public ANN { +public: + struct BuildParam { + int M; + int ef_construction; + int num_threads; + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads; + }; + + // ... +}; +``` + +The benchmark program uses a JSON configuration file. To add the new algorithm to the benchmark, one needs to be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in the configuration file. 
Still take the configuration for `HnswLib` as an example: +``` +{ + "name" : "...", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "...", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1} + ], + "search_result_file" : "..." +}, +``` + +How to interpret these JSON objects is totally left to the implementation and should be specified in `benchmark/src/factory.cuh`: +* First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: + ``` + template + void parse_build_param(const nlohmann::json& conf, + typename cuann::HnswLib::BuildParam& param) { + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + + template + void parse_search_param(const nlohmann::json& conf, + typename cuann::HnswLib::SearchParam& param) { + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + ``` + +* Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, + ``` + // JSON configuration file contains a line like: "algo" : "hnswlib" + if (algo == "hnswlib") { + // ... 
+ } + ``` diff --git a/cpp/cuann_bench/conf/bigann-100M.json b/cpp/cuann_bench/conf/bigann-100M.json new file mode 100644 index 0000000000..d6c3a12f51 --- /dev/null +++ b/cpp/cuann_bench/conf/bigann-100M.json @@ -0,0 +1,342 @@ +{ + "dataset" : { + "name" : "bigann-100M", + "base_file" : "data/bigann-1B/base.1B.u8bin", + "subset_size" : 100000000, + "query_file" : "data/bigann-1B/query.public.10K.u8bin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name": "raft_ivf_pq.dimpq64-cluster5K-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "niter": 25, + "nlist": 5000, + "pq_dim": 64, + "ratio": 10 + }, + "file": "index/bigann-100M/raft_ivf_pq/dimpq64-cluster5K", + "search_params": [ + { + "numProbes": 20, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 30, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 40, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 1000, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/bigann-100M/raft_ivf_pq/dimpq64-cluster5K-float-float" + }, + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, 
"numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M36" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : 
"index/bigann-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, "nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/bigann-100M/ivf_flat/nlist100K" + }, + + + { + "name" : "libcuann.dimpq48-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 48, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq48-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq48-cluster50K.refine2" + }, + + + { + "name" : "libcuann.dimpq64-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 64, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq64-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, 
"numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq64-cluster50K.refine2" + }, + + { + "name" : "libcuann.dimpq64-5bit-cluster50K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 64, + "bitPq" : 5, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq64-5bit-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq64-5bit-cluster50K.refine4" + }, + + { + "name" : "libcuann.dimpq72-cluster50K", + "algo" : "libcuann", + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 72, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq72-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq72-cluster50K" + }, + + { + "name" : "libcuann.dimpq72-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 72, + "randomRotation" : true + 
}, + "file" : "index/bigann-100M/libcuann/dimpq72-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq72-cluster50K.refine2" + }, + + { + "name" : "libcuann.dimpq96-cluster50K", + "algo" : "libcuann", + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 96, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq96-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq96-cluster50K" + }, + + { + "name" : "libcuann.dimpq96-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 96, + "randomRotation" : true + }, + "file" : "index/bigann-100M/libcuann/dimpq96-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + 
{"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-100M/libcuann/dimpq96-cluster50K.refine2" + } + + + ] +} diff --git a/cpp/cuann_bench/conf/bigann-1B.json b/cpp/cuann_bench/conf/bigann-1B.json new file mode 100644 index 0000000000..3f9c8e4457 --- /dev/null +++ b/cpp/cuann_bench/conf/bigann-1B.json @@ -0,0 +1,139 @@ +{ + "dataset" : { + "name" : "bigann-1B", + "base_file" : "data/bigann-1B/base.1B.u8bin", + "query_file" : "data/bigann-1B/query.public.10K.u8bin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "libcuann.dimpq32-cluster100K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 32, + "randomRotation" : true + }, + "file" : "index/bigann-1B/libcuann/dimpq32-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-1B/libcuann/dimpq32-cluster100K.refine4" + }, + + { + "name" : "libcuann.dimpq48-cluster100K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 48, + "randomRotation" : true + }, + "file" : "index/bigann-1B/libcuann/dimpq48-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + 
{"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-1B/libcuann/dimpq48-cluster100K.refine2" + }, + + { + "name" : "libcuann.dimpq64-cluster100K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 64, + "randomRotation" : true + }, + "file" : "index/bigann-1B/libcuann/dimpq64-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-1B/libcuann/dimpq64-cluster100K.refine2" + }, + + { + "name" : "libcuann.dimpq64-5bit-cluster100K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 64, + "bitPq" : 5, + "randomRotation" : true + }, + "file" : "index/bigann-1B/libcuann/dimpq64-5bit-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : 
"result/bigann-1B/libcuann/dimpq64-5bit-cluster100K.refine4" + }, + + { + "name" : "libcuann.dimpq64-5bit-cluster250K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 250000, + "dimPq" : 64, + "bitPq" : 5, + "randomRotation" : true + }, + "file" : "index/bigann-1B/libcuann/dimpq64-5bit-cluster250K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/bigann-1B/libcuann/dimpq64-5bit-cluster250K.refine4" + } + + ] +} diff --git a/cpp/cuann_bench/conf/deep-100M.fp16.json b/cpp/cuann_bench/conf/deep-100M.fp16.json new file mode 100644 index 0000000000..18fb75e8e9 --- /dev/null +++ b/cpp/cuann_bench/conf/deep-100M.fp16.json @@ -0,0 +1,50 @@ +{ + "dataset" : { + "name" : "deep-100M-fp16", + "base_file" : "data/deep-1B/base.1B.f16bin", + "subset_size" : 100000000, + "query_file" : "data/deep-1B/query.public.10K.f16bin", + // although distance should be "euclidean", faiss becomes much slower for that + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "cagra-fp16.k64", + "algo" : "cagra", + "build_param": { + }, + "file" : "index/deep-100M/cagra/k96.pruned.k64", + "search_params": [ + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 32 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 64 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 96 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", 
"internal_k": 128 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 160 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 192 } + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":30 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":40 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":50 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":60 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":30 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":40 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":50 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":60 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 96 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 128 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 160 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 192 } + ], + "search_result_file" : "result/deep-100M/cagra-fp16/k64" + } + + ] +} diff --git a/cpp/cuann_bench/conf/deep-100M.json b/cpp/cuann_bench/conf/deep-100M.json new file mode 100644 index 0000000000..af699c1f50 --- /dev/null +++ b/cpp/cuann_bench/conf/deep-100M.json @@ -0,0 +1,521 @@ +{ + "dataset" : { + "name" : "deep-100M", + "base_file" : "data/deep-1B/base.1B.fbin", + "subset_size" : 100000000, + "query_file" : "data/deep-1B/query.public.10K.fbin", + // although distance should be "euclidean", faiss 
becomes much slower for that + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, 
"numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M36" + }, + + + { + "name" : "cugann.K64", + "algo" : "cugann", + "build_param": { + "K": 64, + "build_mode": "fast", + "max_edge_num": 160, + "rank_threshold": 6, + "long_edge_threshold": 1.3 + }, + "file" : "index/deep-100M/cugann/K64", + "search_params": [ + { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 96, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 144, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 160, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 176, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 192, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 50, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 64, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 96, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 128, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 144, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 160, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 176, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 192, "iteration_num": 7 } + ], + "search_result_file" : "result/deep-100M/cugann/K64" + }, + { + "name" : "cugann.K64-bulk", + "algo" : "cugann", + "build_param": { + "K": 64, + "build_mode": "fast", + "max_edge_num": 160, + "rank_threshold": 6, + "long_edge_threshold": 1.3 + }, + "file" : 
"index/deep-100M/cugann/K64", + "search_params": [ + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 24 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 32 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 38 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 48 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 54 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 64 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 76 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 84 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 90 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 96 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 104 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 110 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 120 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 128 } + ], + "search_result_file" : "result/deep-100M/cugann/K64-bulk" + }, + + + { + "name" : "cagra.k64", + "algo" : "cagra", + "build_param": { + }, + "file" : "index/deep-100M/cagra/k96.pruned.k64", + "search_params": [ + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 32 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 64 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 96 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 128 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 160 }, + // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 192 } + { "batch_size":10000, 
"k":10, "search_mode": "single-cta", "internal_k": 32 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":30 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":40 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":50 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":60 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":30 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":40 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":50 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":60 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 96 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 128 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 160 }, + { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 192 } + ], + "search_result_file" : "result/deep-100M/cagra/k64" + }, + + + { + "name" : "faiss_ivf_flat.nlist50K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":50000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist50K" + }, + { + "name" : "faiss_ivf_flat.nlist100K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":100000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + 
{"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist100K" + }, + { + "name" : "faiss_ivf_flat.nlist200K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":200000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist200K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist200K" + }, + + + { + "name" : "faiss_ivf_pq.M48-nlist16K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist16K", + "search_params" : [ + {"nprobe":10}, + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist16K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist50K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist100K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":100000, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist100K" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : 
"index/deep-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, "nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/deep-100M/ivf_flat/nlist100K" + }, + + + { + "name" : "libcuann.dimpq48-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 48 + }, + "file" : "index/deep-100M/libcuann/dimpq48-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq48-cluster50K.refine2" + }, + { + "name" : "libcuann.dimpq48-5bit-cluster50K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 48, + "bitPq" : 5 + }, + "file" : "index/deep-100M/libcuann/dimpq48-5bit-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, 
"k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq48-5bit-cluster50K.refine4" + }, + { + "name" : "libcuann.dimpq64-cluster50K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 64 + }, + "file" : "index/deep-100M/libcuann/dimpq64-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq64-cluster50K.refine2" + }, + { + "name" : "libcuann.dimpq64-5bit-cluster50K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 64, + "bitPq" : 5 + }, + "file" : "index/deep-100M/libcuann/dimpq64-5bit-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq64-5bit-cluster50K.refine4" + }, + { + "name" : "libcuann.dimpq72-cluster50K", + "algo" : "libcuann", + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 72 + }, + "file" : "index/deep-100M/libcuann/dimpq72-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, 
"numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq72-cluster50K" + }, + { + "name" : "libcuann.dimpq96-cluster50K", + "algo" : "libcuann", + "build_param": { + "numDataset" : 100000000, + "numClusters" : 50000, + "dimPq" : 96 + }, + "file" : "index/deep-100M/libcuann/dimpq96-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/libcuann/dimpq96-cluster50K" + }, + + + // the following multigpu configurations are for validating correctness, not for measuring performance + { + "name" : "multigpu.faiss_ivf_flat.nlist10K", + "algo" : "faiss_gpu_ivf_flat", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": {"nlist":10000}, + "file" : "index/deep-100M/multigpu/faiss_ivf_flat.nlist10K", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/multigpu/faiss_ivf_flat.nlist10K" + }, + + { + "name" : "multigpu.libcuann.dimpq72-cluster10K", + "algo" : "libcuann", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": { + "numDataset" : 12500000, + "numClusters" : 10000, + "dimPq" : 72 + }, + "file" : 
"index/deep-100M/multigpu/libcuann.dimpq72-cluster10K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/multigpu/libcuann.dimpq72-cluster10K" + }, + + { + "name" : "multigpu.libcuann.dimpq72-cluster10K.refine2", + "algo" : "libcuann", + "multigpu" : [0,1,2,3,4,5,6,7], + "refine_ratio" : 2, + "build_param": { + "numDataset" : 12500000, + "numClusters" : 10000, + "dimPq" : 72 + }, + "file" : "index/deep-100M/multigpu/libcuann.dimpq72-cluster10K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-100M/multigpu/libcuann.dimpq72-cluster10K.refine2" + } + + ] +} diff --git a/cpp/cuann_bench/conf/deep-1B.json b/cpp/cuann_bench/conf/deep-1B.json new file mode 100644 index 0000000000..8218ef30bd --- /dev/null +++ b/cpp/cuann_bench/conf/deep-1B.json @@ -0,0 +1,304 @@ +{ + "dataset" : { + "name" : "deep-1B", + "base_file" : "data/deep-1B/base.1B.fbin", + "query_file" : "data/deep-1B/query.public.10K.fbin", + // although distance should be "euclidean", faiss becomes much slower for that + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" 
: [ + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-1B/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/deep-1B/faiss_ivf_pq/M48-nlist50K" + }, + + + { + "name" : "libcuann.dimpq48-cluster100K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 48 + }, + "file" : "index/deep-1B/libcuann/dimpq48-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/libcuann/dimpq48-cluster100K.refine2" + }, + { + "name" : "libcuann.dimpq48-5bit-cluster100K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 48, + "bitPq" : 5 + }, + "file" : "index/deep-1B/libcuann/dimpq48-5bit-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : 
"result/deep-1B/libcuann/dimpq48-5bit-cluster100K.refine4" + }, + { + "name" : "libcuann.dimpq64-cluster100K.refine2", + "algo" : "libcuann", + "refine_ratio" : 2, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 64 + }, + "file" : "index/deep-1B/libcuann/dimpq64-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/libcuann/dimpq64-cluster100K.refine2" + }, + { + "name" : "libcuann.dimpq64-5bit-cluster100K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 64, + "bitPq" : 5 + }, + "file" : "index/deep-1B/libcuann/dimpq64-5bit-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/libcuann/dimpq64-5bit-cluster100K.refine4" + }, + { + "name" : "libcuann.dimpq64-5bit-cluster250K.refine4", + "algo" : "libcuann", + "refine_ratio" : 4, + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 250000, + "dimPq" : 64, + "bitPq" : 5 + }, + "file" : "index/deep-1B/libcuann/dimpq64-5bit-cluster250K", + "search_params" : [ + {"max_batch_size":10000, "k":10, 
"numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/libcuann/dimpq64-5bit-cluster250K.refine4" + }, + + { + "name" : "libcuann.dimpq72-cluster100K", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1000000000, + "numClusters" : 100000, + "dimPq" : 72 + }, + "file" : "index/deep-1B/libcuann/dimpq72-cluster100K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/libcuann/dimpq72-cluster100K" + }, + // libcuann.dimpq96-cluster100K: index size 94GB, >80GB, so becomes slow + + + { + "name" : "multigpu.faiss_ivf_flat.nlist50K", + "algo" : "faiss_gpu_ivf_flat", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": {"nlist":50000}, + "file" : "index/deep-1B/multigpu/faiss_ivf_flat.nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-1B/multigpu/faiss_ivf_flat.nlist50K" + }, + + { + "name" : "multigpu.libcuann.dimpq48-cluster50K.refine2", + "algo" : "libcuann", + "multigpu" : [0,1,2,3,4,5,6,7], + "refine_ratio" : 2, + "build_param": { + "numDataset" : 125000000, + "numClusters" : 50000, + "dimPq" : 48 + }, 
+ "file" : "index/deep-1B/multigpu/libcuann.dimpq48-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/multigpu/libcuann.dimpq48-cluster50K.refine2" + }, + { + "name" : "multigpu.libcuann.dimpq96-cluster50K", + "algo" : "libcuann", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": { + "numDataset" : 125000000, + "numClusters" : 50000, + "dimPq" : 96 + }, + "file" : "index/deep-1B/multigpu/libcuann.dimpq96-cluster50K", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":20}, + {"max_batch_size":10000, "k":10, "numProbes":30}, + {"max_batch_size":10000, "k":10, "numProbes":40}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/deep-1B/multigpu/libcuann.dimpq96-cluster50K" + }, + + { + "name" : "multigpu.cugann.K64", + "algo" : "cugann", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": { + "K": 64, + "build_mode": "fast", + "max_edge_num": 160, + "rank_threshold": 6, + "long_edge_threshold": 1.3 + }, + "file" : "index/deep-1B/multigpu/cugann.K64", + "search_params": [ + { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 96, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 }, + { "search_mode": 
"fast", "searcher_num": 144, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 160, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 176, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 192, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 50, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 64, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 96, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 128, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 144, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 160, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 176, "iteration_num": 7 }, + { "search_mode": "fast", "searcher_num": 192, "iteration_num": 7 } + ], + "search_result_file" : "result/deep-1B/multigpu/cugann.K64" + }, + { + "name" : "multigpu.cugann.K64-bulk", + "algo" : "cugann", + "multigpu" : [0,1,2,3,4,5,6,7], + "build_param": { + "K": 64, + "build_mode": "fast", + "max_edge_num": 160, + "rank_threshold": 6, + "long_edge_threshold": 1.3 + }, + "file" : "index/deep-1B/multigpu/cugann.K64", + "search_params": [ + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 24 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 32 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 38 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 48 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 54 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 64 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 76 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 84 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 90 }, + { "search_mode": "bulk", 
"searcher_num": 1, "searcher_k": 96, "iteration_num": 96 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 104 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 110 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 120 }, + { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 128 } + ], + "search_result_file" : "result/deep-1B/multigpu/cugann.K64-bulk" + } + + ] +} diff --git a/cpp/cuann_bench/conf/glove-100-inner.json b/cpp/cuann_bench/conf/glove-100-inner.json new file mode 100644 index 0000000000..887190683f --- /dev/null +++ b/cpp/cuann_bench/conf/glove-100-inner.json @@ -0,0 +1,1506 @@ +{ + "dataset" : { + "name" : "glove-100-inner", + "base_file" : "data/glove-100-inner/base.fbin", + "query_file" : "data/glove-100-inner/query.fbin", + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 1, + "k" : 10, + "run_count" : 3 + }, + + "index" : [ + { + "name" : "hnswlib.M4", + "algo" : "hnswlib", + "build_param": {"M":4, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M4", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M4" + }, + + { + "name" : "hnswlib.M8", + "algo" : "hnswlib", + "build_param": {"M":8, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M8", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + 
"search_result_file" : "result/glove-100-inner/hnswlib/M8" + }, + + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M12" + }, + + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M16" + }, + + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M24" + }, + + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, 
"numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M36" + }, + + { + "name" : "hnswlib.M48", + "algo" : "hnswlib", + "build_param": {"M":48, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M48", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M48" + }, + + { + "name" : "hnswlib.M64", + "algo" : "hnswlib", + "build_param": {"M":64, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M64", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M64" + }, + + { + "name" : "hnswlib.M96", + "algo" : "hnswlib", + "build_param": {"M":96, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M96", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M96" + }, + + + { + "name" : "cuhnsw.M4", + "algo" : "cuhnsw", + "build_param": {"M":4, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M4", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + 
{"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M4" + }, + + { + "name" : "cuhnsw.M8", + "algo" : "cuhnsw", + "build_param": {"M":8, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M8", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M8" + }, + + { + "name" : "cuhnsw.M12", + "algo" : "cuhnsw", + "build_param": {"M":12, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M12", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M12" + }, + + { + "name" : "cuhnsw.M16", + "algo" : "cuhnsw", + "build_param": {"M":16, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M16", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M16" + }, + + { + "name" : "cuhnsw.M24", + "algo" : "cuhnsw", + "build_param": {"M":24, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M24", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M24" + }, + + { + "name" : "cuhnsw.M36", + "algo" : "cuhnsw", + "build_param": {"M":36, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M36", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : 
"result/glove-100-inner/cuhnsw/M36" + }, + + { + "name" : "cuhnsw.M48", + "algo" : "cuhnsw", + "build_param": {"M":48, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M48", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M48" + }, + + { + "name" : "cuhnsw.M64", + "algo" : "cuhnsw", + "build_param": {"M":64, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M64", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M64" + }, + + { + "name" : "cuhnsw.M96", + "algo" : "cuhnsw", + "build_param": {"M":96, "efConstruction":500, "block_dim":64}, + "file" : "index/glove-100-inner/cuhnsw/M96", + "search_params" : [ + {"ef":10}, + {"ef":20}, + {"ef":40}, + {"ef":80}, + {"ef":120}, + {"ef":200}, + {"ef":400}, + {"ef":600}, + {"ef":800} + ], + "search_result_file" : "result/glove-100-inner/cuhnsw/M96" + }, + + { + "name" : "cugann.K224", + "algo" : "cugann", + "build_param" : { + "K": 224, + "build_mode": "fast", + "max_edge_num": 160, + "rank_threshold": 6, + "long_edge_threshold": 1.3 + }, + "file" : "index/glove-100-inner/cugann/K224", + "search_params": [ + { "search_mode": "fast", "searcher_num": 8, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 10, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 14, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 16, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 20, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 26, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 32, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 36, "iteration_num": 4 }, + { 
"search_mode": "fast", "searcher_num": 40, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 44, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 48, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 52, "iteration_num": 4 }, + { "search_mode": "fast", "searcher_num": 8, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 10, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 14, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 16, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 20, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 26, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 32, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 36, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 40, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 44, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 48, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 52, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 56, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 60, "iteration_num": 5 }, + { "search_mode": "fast", "searcher_num": 32, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 34, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 96, "iteration_num": 6 }, + { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 } + ], + "search_result_file" : "result/glove-100-inner/cugann/K224" + }, + + { + "name" : "faiss_ivf_flat.nlist1024", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":1024}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + 
{"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist1024" + }, + + { + "name" : "faiss_ivf_flat.nlist2048", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":2048}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist2048" + }, + + { + "name" : "faiss_ivf_flat.nlist4096", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":4096}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist4096" + }, + + { + "name" : "faiss_ivf_flat.nlist8192", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":8192}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist8192" + }, + + { + "name" : "faiss_ivf_flat.nlist16384", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":16384}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist16384" + }, + + + + { + "name" : "faiss_ivf_pq.M2-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":2}, + "file" : 
"index/glove-100-inner/faiss_ivf_pq/M2-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : 
"result/glove-100-inner/faiss_ivf_pq/M2-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + 
{"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist16384", + "algo" : 
"faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist16384" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist1024-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist2048-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist4096-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + 
{"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-fp16" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist1024-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist2048-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : 
"result/glove-100-inner/faiss_ivf_sq/nlist4096-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-int8" + }, + + { + "name" : "faiss_flat", + "algo" : "faiss_gpu_flat", + "build_param": {}, + "file" : "index/glove-100-inner/faiss_flat/flat", + "search_params" : [{}], + "search_result_file" : "result/glove-100-inner/faiss_flat/flat" + }, + + { + "name" : "ggnn.kbuild96-segment64-refine2-k10", + "algo" : "ggnn", + "build_param": { + "k_build": 96, + "segment_size": 64, + "refine_iterations": 2, + "dataset_size": 1183514, + "k": 10 + }, + "file" : "index/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10", + "search_params" : [ + {"tau":0.001, "block_dim":64, "sorted_size":32}, + {"tau":0.005, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":64, "sorted_size":32}, + {"tau":0.02, "block_dim":64, "sorted_size":32}, + {"tau":0.03, "block_dim":64, "sorted_size":32}, + {"tau":0.04, "block_dim":64, "sorted_size":32}, + {"tau":0.05, "block_dim":64, "sorted_size":32}, + {"tau":0.06, "block_dim":64, "sorted_size":32}, + {"tau":0.09, "block_dim":64, "sorted_size":32}, + {"tau":0.12, "block_dim":64, 
"sorted_size":32}, + {"tau":0.18, "block_dim":64, "sorted_size":32}, + {"tau":0.21, "block_dim":64, "sorted_size":32}, + {"tau":0.24, "block_dim":64, "sorted_size":32}, + {"tau":0.27, "block_dim":64, "sorted_size":32}, + {"tau":0.3, "block_dim":64, "sorted_size":32}, + {"tau":0.4, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.02, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.03, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.04, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.05, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.06, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.09, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.12, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.18, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.21, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.24, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.27, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.3, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.4, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.5, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32} + + ], + "search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10" + }, + + + { + "name" : "libcuann.dimpq10-cluster1024", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 1024, + "dimPq" : 10 + }, + "file" : 
"index/glove-100-inner/libcuann/dimpq10-cluster1024", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster1024" + }, + + { + "name" : "libcuann.dimpq10-cluster2048", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 2048, + "dimPq" : 10 + }, + "file" : "index/glove-100-inner/libcuann/dimpq10-cluster2048", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster2048" + }, + + { + "name" : "libcuann.dimpq10-cluster4096", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 4096, + "dimPq" : 10 + }, + "file" : "index/glove-100-inner/libcuann/dimpq10-cluster4096", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, 
"numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster4096" + }, + + { + "name" : "libcuann.dimpq10-cluster8192", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 8192, + "dimPq" : 10 + }, + "file" : "index/glove-100-inner/libcuann/dimpq10-cluster8192", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster8192" + }, + + { + "name" : "libcuann.dimpq10-cluster16384", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 16384, + "dimPq" : 10 + }, + "file" : "index/glove-100-inner/libcuann/dimpq10-cluster16384", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000}, + {"max_batch_size":10000, "k":10, "numProbes":2000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster16384" + }, + + { + "name" : "libcuann.dimpq20-cluster1024", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 1024, + "dimPq" : 20 + }, + "file" : "index/glove-100-inner/libcuann/dimpq20-cluster1024", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + 
{"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster1024" + }, + + { + "name" : "libcuann.dimpq20-cluster2048", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 2048, + "dimPq" : 20 + }, + "file" : "index/glove-100-inner/libcuann/dimpq20-cluster2048", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster2048" + }, + + { + "name" : "libcuann.dimpq20-cluster4096", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 4096, + "dimPq" : 20 + }, + "file" : "index/glove-100-inner/libcuann/dimpq20-cluster4096", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster4096" + }, + + { + "name" : 
"libcuann.dimpq20-cluster8192", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 8192, + "dimPq" : 20 + }, + "file" : "index/glove-100-inner/libcuann/dimpq20-cluster8192", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster8192" + }, + + { + "name" : "libcuann.dimpq20-cluster16384", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 16384, + "dimPq" : 20 + }, + "file" : "index/glove-100-inner/libcuann/dimpq20-cluster16384", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000}, + {"max_batch_size":10000, "k":10, "numProbes":2000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster16384" + }, + + { + "name" : "libcuann.dimpq50-cluster1024", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 1024, + "dimPq" : 50 + }, + "file" : "index/glove-100-inner/libcuann/dimpq50-cluster1024", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, 
"numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster1024" + }, + + { + "name" : "libcuann.dimpq50-cluster2048", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 2048, + "dimPq" : 50 + }, + "file" : "index/glove-100-inner/libcuann/dimpq50-cluster2048", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster2048" + }, + + { + "name" : "libcuann.dimpq50-cluster4096", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 4096, + "dimPq" : 50 + }, + "file" : "index/glove-100-inner/libcuann/dimpq50-cluster4096", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster4096" + }, + + { + "name" : "libcuann.dimpq50-cluster8192", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 8192, + "dimPq" : 50 + }, + "file" : 
"index/glove-100-inner/libcuann/dimpq50-cluster8192", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster8192" + }, + + { + "name" : "libcuann.dimpq50-cluster16384", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 16384, + "dimPq" : 50 + }, + "file" : "index/glove-100-inner/libcuann/dimpq50-cluster16384", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000}, + {"max_batch_size":10000, "k":10, "numProbes":2000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster16384" + }, + + { + "name" : "libcuann.dimpq100-cluster1024", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 1024, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/libcuann/dimpq100-cluster1024", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, 
"numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster1024" + }, + + { + "name" : "libcuann.dimpq100-cluster2048", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 2048, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/libcuann/dimpq100-cluster2048", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster2048" + }, + + { + "name" : "libcuann.dimpq100-cluster4096", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 4096, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/libcuann/dimpq100-cluster4096", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster4096" + }, + + { + "name" : "libcuann.dimpq100-cluster8192", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 8192, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/libcuann/dimpq100-cluster8192", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + 
{"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster8192" + }, + + { + "name" : "libcuann.dimpq100-cluster16384", + "algo" : "libcuann", + "build_param": { + "numDataset" : 1183514, + "numClusters" : 16384, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/libcuann/dimpq100-cluster16384", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000}, + {"max_batch_size":10000, "k":10, "numProbes":2000} + ], + "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster16384" + }, + + { + "name" : "ivf_flat.nlist1024", + "algo" : "ivf_flat", + "build_param": { + "nlist":1024, + "ratio":2, + "niter":20 + }, + "file" : "index/glove-100-inner/ivf_flat/nlist1024", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":1}, + {"max_batch":10000, "max_k":10, "nprobe":5}, + {"max_batch":10000, "max_k":10, "nprobe":10}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/ivf_flat/nlist1024" + }, + + + { + "name" : 
"multigpu-libcuann.dimpq100-cluster1024", + "algo" : "libcuann", + "multigpu" : [0, 1], + "build_param": { + "numDataset" : 591757, + "numClusters" : 1024, + "dimPq" : 100 + }, + "file" : "index/glove-100-inner/multigpu/libcuann/dimpq100-cluster1024", + "search_params" : [ + {"max_batch_size":10000, "k":10, "numProbes":1}, + {"max_batch_size":10000, "k":10, "numProbes":5}, + {"max_batch_size":10000, "k":10, "numProbes":10}, + {"max_batch_size":10000, "k":10, "numProbes":50}, + {"max_batch_size":10000, "k":10, "numProbes":100}, + {"max_batch_size":10000, "k":10, "numProbes":200}, + {"max_batch_size":10000, "k":10, "numProbes":500}, + {"max_batch_size":10000, "k":10, "numProbes":1000} + ], + "search_result_file" : "result/glove-100-inner/multigpu/libcuann/dimpq100-cluster1024" + } + ] + +} diff --git a/cpp/cuann_bench/conf/sift-128-euclidean.json b/cpp/cuann_bench/conf/sift-128-euclidean.json new file mode 100644 index 0000000000..081d6cba2c --- /dev/null +++ b/cpp/cuann_bench/conf/sift-128-euclidean.json @@ -0,0 +1,2023 @@ +{ + "dataset": { + "name": "sift-128-euclidean", + "base_file": "/workspace/rapids/knn/cuann/benchmark/sift-128-euclidean/base.fbin", + "query_file": "/workspace/rapids/knn/cuann/benchmark/sift-128-euclidean/query.fbin", + "distance": "euclidean" + }, + "search_basic_param": { + "batch_size": 5000, + "k": 10, + "run_count": 3 + }, + "index": [ + { + "name": "raft_bfknn", + "algo": "raft_bfknn", + "build_param": {}, + "file": "index/sift-128-euclidean/raft_bfknn/bfknn", + "search_params": [ + { + "probe": 1 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_bfknn/bfknn" + }, + { + "name": "faiss_ivf_flat.nlist1024", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 1024 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + 
}, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist1024" + }, + { + "name": "faiss_ivf_flat.nlist2048", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 2048 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist2048", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist2048" + }, + { + "name": "faiss_ivf_flat.nlist4096", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 4096 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist4096", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist4096" + }, + { + "name": "faiss_ivf_flat.nlist8192", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 8192 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist8192", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist8192" + }, + { + "name": "faiss_ivf_flat.nlist16384", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 16384 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + 
"search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist16384" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": true + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024.noprecomp", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": false + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024.noprecomp", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024.noprecomp" + }, + { + "name": "faiss_ivf_sq.nlist1024-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16" + }, + { + "name": "faiss_ivf_sq.nlist2048-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 
500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16" + }, + { + "name": "faiss_ivf_sq.nlist4096-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16" + }, + { + "name": "faiss_ivf_sq.nlist8192-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16" + }, + { + "name": "faiss_ivf_sq.nlist16384-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16" + }, + { + "name": "faiss_ivf_sq.nlist1024-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { 
+ "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8" + }, + { + "name": "faiss_ivf_sq.nlist2048-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8" + }, + { + "name": "faiss_ivf_sq.nlist4096-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8" + }, + { + "name": "faiss_ivf_sq.nlist8192-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8" + }, + { + "name": "faiss_ivf_sq.nlist16384-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8", + "search_params": [ + { + 
"nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8" + }, + { + "name": "faiss_flat", + "algo": "faiss_gpu_flat", + "build_param": {}, + "file": "index/sift-128-euclidean/faiss_flat/flat", + "search_params": [ + {} + ], + "search_result_file": "result/sift-128-euclidean/faiss_flat/flat" + }, + { + "name": "libcuann.dimpq128-cluster1024", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 128, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024 + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024" + }, + { + "name": "libcuann.dimpq128-cluster1024-prof", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 128, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-prof" + }, + { + "name": "libcuann.dimpq64-cluster1024-float-half", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 64, + "sampleRatio": 1 + }, + "file": 
"index/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-half", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-half" + }, + { + "name": "libcuann.dimpq64-cluster1024-float-fp8", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 64, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-fp8", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + 
"max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-fp8" + }, + { + "name": "libcuann.dimpq32-cluster1024-float-fp8", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 32, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq32-cluster1024-float-fp8", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq32-cluster1024-float-fp8" + }, + { + 
"name": "libcuann.dimpq16-cluster1024-float-fp8", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 16, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq16-cluster1024-float-fp8", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq16-cluster1024-float-fp8" + }, + { + "name": "libcuann.dimpq128-cluster1024-float-float", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 128, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1, + 
"internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float" + }, + { + "name": "libcuann.dimpq128-cluster1024-float-half", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 128, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-half", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + 
"max_batch_size": 10000, + "k": 10, + "numProbes": 1000, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-half" + }, + { + "name": "libcuann.dimpq128-cluster1024-float-fp8", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 1024, + "dimPq": 128, + "sampleRatio": 1 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-fp8", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-fp8" + }, + { + "name": "libcuann.dimpq128-cluster16384", + "algo": "libcuann", + "build_param": { + "numDataset": 1000000, + "numClusters": 16384, + "dimPq": 128, + "sampleRatio": 2 + }, + "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster16384", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100 + }, + { + "max_batch_size": 10000, + 
"k": 10, + "numProbes": 200 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1000 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster16384" + }, + { + "name": "ivf_flat.nlist1024", + "algo": "ivf_flat", + "build_param": { + "nlist": 1024, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/ivf_flat/nlist1024", + "search_params": [ + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 1 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 5 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 10 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 50 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 100 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 200 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 500 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/ivf_flat/nlist1024" + }, + { + "name": "ivf_flat.nlist16384", + "algo": "ivf_flat", + "build_param": { + "nlist": 16384, + "ratio": 2, + "niter": 20 + }, + "file": "index/sift-128-euclidean/ivf_flat/nlist16384", + "search_params": [ + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 1 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 5 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 10 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 50 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 100 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 200 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 500 + }, + { + "max_batch": 10000, + "max_k": 10, + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/ivf_flat/nlist16384" + }, + { + "name": "multigpu-libcuann.dimpq128-cluster1024", + "algo": "libcuann", + "multigpu": [ + 0, + 1 + ], + 
"build_param": { + "numDataset": 591757, + "numClusters": 1024, + "dimPq": 128 + }, + "file": "index/sift-128-euclidean/multigpu/libcuann/dimpq128-cluster1024", + "search_params": [ + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 5 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 10 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 50 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 100 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 200 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 500 + }, + { + "max_batch_size": 10000, + "k": 10, + "numProbes": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/multigpu/libcuann/dimpq128-cluster1024" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-prof", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float", 
+ "search_params": [ + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-prof" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + 
"k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + 
"internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 32, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + 
"smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 16, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-half-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + 
"numProbes": 200, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float" + }, + { + "name": "raft_ivf_pq.dimpq512-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 512, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float" + }, + { + "name": "raft_ivf_flat.nlist1024", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 1024, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist1024" + }, + { + "name": "raft_ivf_flat.nlist16384", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 16384, + 
"ratio": 2, + "niter": 20 + }, + "file": "index/sift-128-euclidean/raft_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist16384" + } + ] +} \ No newline at end of file diff --git a/cpp/cuann_bench/scripts/eval.pl b/cpp/cuann_bench/scripts/eval.pl new file mode 100755 index 0000000000..81c5563d79 --- /dev/null +++ b/cpp/cuann_bench/scripts/eval.pl @@ -0,0 +1,430 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); +use File::Find; +use Getopt::Std; + +my $QPS = 'QPS'; +my $AVG_LATENCY = 'avg_latency(ms)'; +my $P99_LATENCY = 'p99_latency(ms)'; +my $P999_LATENCY = 'p999_latency(ms)'; +my @CONDITIONS = ([$QPS, 2000], ['recall', 0.9], ['recall', 0.95]); + + +my $USAGE = << 'END'; +usage: [-f] [-l avg|p99|p999] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. 
+ + -f: force to recompute recall and update it in result file if needed + -l: output search latency rather than QPS. Available options: + "avg" for average latency; + "p99" for 99th percentile latency; + "p999" for 99.9th percentile latency. + -o: also write result to a csv file +END + + +my %opt; +getopts('fl:o:', \%opt) + or die $USAGE; +my $force_calc_recall = exists $opt{f} ? 1 : 0; +my $csv_file; +$csv_file = $opt{o} if exists $opt{o}; +my $metric = $QPS; +if (exists $opt{l}) { + my $option = $opt{l}; + if ($option eq 'avg') { + $metric = $AVG_LATENCY; + } + elsif ($option eq 'p99') { + $metric = $P99_LATENCY; + } + elsif ($option eq 'p999') { + $metric = $P999_LATENCY; + } + else { + die + "[error] illegal value for '-l': '$option'. Must be 'avg', 'p99' or 'p999'\n"; + } +} + +@ARGV >= 2 + or die $USAGE; + + +my $truth_file = shift @ARGV; +my ($k, $dataset, $distance, $results) = get_all_results($metric, @ARGV); +if (!defined $k) { + print STDERR "no result file found\n"; + exit -1; +} +print STDERR "dataset = $dataset, distance = $distance, k = $k\n\n"; +calc_missing_recall($results, $truth_file, $force_calc_recall); + +my @results = sort { + $a->{name} cmp $b->{name} + or $a->{recall} <=> $b->{recall} + or $b->{qps} <=> $a->{qps} +} @$results; +printf("%-60s %6s %16s %s\n", '', 'Recall', $metric, 'search_param'); +for my $result (@results) { + my $fmt = ($metric eq $QPS) ? 
'%16.1f' : '%16.3f'; + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; # the unit of latency is ms + printf("%-60s %6.4f ${fmt} %s\n", + $result->{name}, $result->{recall}, $qps, $result->{search_param}); +} +if (defined $csv_file) { + open my $fh, '>', $csv_file; + print {$fh} ",Recall,${metric},search_param\n"; + for my $result (@results) { + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; + printf {$fh} ( + "%s,%.4f,%.3f,%s\n", $result->{name}, $result->{recall}, + $qps, $result->{search_param} + ); + } +} +print "\n"; +calc_and_print_estimation($results, $metric, \@CONDITIONS); + + + + +sub read_result { + my ($fname) = @_; + open my $fh, '<', $fname; + my %attr; + while (<$fh>) { + chomp; + next if /^\s*$/; + my $pos = index($_, ':'); + $pos != -1 + or die "[error] no ':' is found: '$_'\n"; + my $key = substr($_, 0, $pos); + my $val = substr($_, $pos + 1); + $key =~ s/^\s+|\s+$//g; + $val =~ s/^\s+|\s+$//g; + + # old version benchmark compatible + if ($key eq 'search_time') { + $key = 'average_search_time'; + $val *= $attr{batch_size}; + } + $attr{$key} = $val; + } + return \%attr; +} + +sub overwrite_recall_to_result { + my ($fname, $recall) = @_; + open my $fh_in, '<', $fname; + $recall = sprintf("%f", $recall); + my $out; + while (<$fh_in>) { + s/^recall: .*/recall: $recall/; + $out .= $_; + } + close $fh_in; + + open my $fh_out, '>', $fname; + print {$fh_out} $out; +} + +sub append_recall_to_result { + my ($fname, $recall) = @_; + open my $fh, '>>', $fname; + printf {$fh} ("recall: %f\n", $recall); +} + +sub get_all_results { + my ($metric) = shift @_; + + my %fname; + my $wanted = sub { + if (-f && /\.txt$/) { + $fname{$File::Find::name} = 1; + } + }; + find($wanted, @_); + + my $k; + my $dataset; + my $distance; + my @results; + for my $f (sort keys %fname) { + print STDERR "reading $f ...\n"; + my $attr = read_result($f); + if (!defined $k) { + $k = $attr->{k}; + $dataset = $attr->{dataset}; + $distance = $attr->{distance}; + } 
+ else { + $attr->{k} eq $k + or die "[error] k should be $k, but is $attr->{k} in $f\n"; + $attr->{dataset} eq $dataset + or die + "[error] dataset should be $dataset, but is $attr->{dataset} in $f\n"; + $attr->{distance} eq $distance + or die + "[error] distance should be $distance, but is $attr->{distance} in $f\n"; + } + + my $batch_size = $attr->{batch_size}; + $batch_size =~ s/000000$/M/; + $batch_size =~ s/000$/K/; + my $search_param = $attr->{search_param}; + $search_param =~ s/^{//; + $search_param =~ s/}$//; + $search_param =~ s/,/ /g; + $search_param =~ s/"//g; + + my $qps; + if ($metric eq $QPS) { + $qps = $attr->{batch_size} / $attr->{average_search_time}; + } + elsif ($metric eq $AVG_LATENCY) { + $qps = $attr->{average_search_time}; + } + elsif ($metric eq $P99_LATENCY) { + exists $attr->{p99_search_time} + or die "[error] p99_search_time is not found\n"; + $qps = $attr->{p99_search_time}; + } + elsif ($metric eq $P999_LATENCY) { + exists $attr->{p999_search_time} + or die "[error] p999_search_time is not found\n"; + $qps = $attr->{p999_search_time}; + } + else { + die "[error] unknown latency type: '$metric'\n"; + } + my $result = { + file => $f, + name => "$attr->{name}-batch${batch_size}", + search_param => $search_param, + qps => $qps, + }; + + if (exists $attr->{recall}) { + $result->{recall} = $attr->{recall}; + } + push @results, $result; + } + return $k, $dataset, $distance, \@results; +} + +sub read_ibin { + my ($fname) = @_; + + open my $fh, '<:raw', $fname; + my $raw; + + read($fh, $raw, 8); + my ($nrows, $dim) = unpack('LL', $raw); + + my $expected_size = 8 + $nrows * $dim * 4; + my $size = (stat($fh))[7]; + $size == $expected_size + or die( + "[error] expected size is $expected_size, but actual size is $size\n"); + + read($fh, $raw, $nrows * $dim * 4) == $nrows * $dim * 4 + or die "[error] read $fname failed\n"; + my @data = unpack('l' x ($nrows * $dim), $raw); + return \@data, $nrows, $dim; +} + +sub pick_k_neighbors { + my ($neighbors, 
$nrows, $ncols, $k) = @_; + + my @res; + for my $i (0 .. $nrows - 1) { + my %neighbor_set; + for my $j (0 .. $k - 1) { + $neighbor_set{$neighbors->[$i * $ncols + $j]} = 1; + } + push @res, \%neighbor_set; + } + return \@res; +} + + +sub calc_recall { + my ($truth_k_neighbors, $result_neighbors, $nrows, $k) = @_; + + my $recall = 0; + for my $i (0 .. $nrows - 1) { + my $tp = 0; + for my $j (0 .. $k - 1) { + my $neighbor = $result_neighbors->[$i * $k + $j]; + ++$tp if exists $truth_k_neighbors->[$i]{$neighbor}; + } + $recall += $tp; + } + return $recall / $k / $nrows; +} + +sub calc_missing_recall { + my ($results, $truth_file, $force_calc_recall) = @_; + + my $need_calc_recall = grep { !exists $_->{recall} } @$results; + return unless $need_calc_recall || $force_calc_recall; + + my ($truth_neighbors, $nrows, $truth_k) = read_ibin($truth_file); + $truth_k >= $k + or die "[error] ground truth k ($truth_k) < k($k)\n"; + my $truth_k_neighbors = + pick_k_neighbors($truth_neighbors, $nrows, $truth_k, $k); + + for my $result (@$results) { + next if exists $result->{recall} && !$force_calc_recall; + + my $result_bin_file = $result->{file}; + $result_bin_file =~ s/txt$/ibin/; + print STDERR "calculating recall for $result_bin_file ...\n"; + my ($result_neighbors, $result_nrows, $result_k) = + read_ibin($result_bin_file); + $result_k == $k + or die + "[error] k should be $k, but is $result_k in $result_bin_file\n"; + $result_nrows == $nrows + or die + "[error] #row should be $nrows, but is $result_nrows in $result_bin_file\n"; + + my $recall = + calc_recall($truth_k_neighbors, $result_neighbors, $nrows, $k); + if (exists $result->{recall}) { + my $new_value = sprintf("%f", $recall); + if ($result->{recall} ne $new_value) { + print "update recall: $result->{recall} -> $new_value\n"; + overwrite_recall_to_result($result->{file}, $recall); + } + } + else { + append_recall_to_result($result->{file}, $recall); + } + $result->{recall} = $recall; + } +} + + +sub estimate { + my 
($results, $condition, $value) = @_; + my %point_of; + for my $result (@$results) { + my $point; + if ($condition eq 'recall') { + $point = [$result->{recall}, $result->{qps}]; + } + else { + $point = [$result->{qps}, $result->{recall}]; + } + push @{$point_of{$result->{name}}}, $point; + } + + my @names = sort keys %point_of; + my @result; + for my $name (@names) { + my @points = sort { $a->[0] <=> $b->[0] } @{$point_of{$name}}; + if ($value < $points[0][0] || $value > $points[$#points][0]) { + push @result, -1; + next; + } + elsif ($value == $points[0][0]) { + push @result, $points[0][1]; + next; + } + + for my $i (1 .. $#points) { + if ($points[$i][0] >= $value) { + push @result, + linear_interpolation($value, @{$points[$i - 1]}, + @{$points[$i]}); + last; + } + } + } + return \@names, \@result; +} + +sub linear_interpolation { + my ($x, $x1, $y1, $x2, $y2) = @_; + return $y1 + ($x - $x1) * ($y2 - $y1) / ($x2 - $x1); +} + +sub merge { + my ($all, $new, $scale) = @_; + @$all == @$new + or die "[error] length is not equal\n"; + for my $i (0 .. @$all - 1) { + push @{$all->[$i]}, $new->[$i] * $scale; + } +} + +sub calc_and_print_estimation { + my ($results, $metric, $conditions) = @_; + + my @conditions = grep { + my $target = $_->[0]; + if ($target eq 'recall' || $target eq $metric) { + 1; + } + else { + $target eq $QPS + || $target eq $AVG_LATENCY + || $target eq $P99_LATENCY + || $target eq $P999_LATENCY + or die "[error] unknown condition: '$target'\n"; + 0; + } + } @$conditions; + + my @headers = map { + my $header; + if ($_->[0] eq 'recall') { + $header = $metric . '@recall' . $_->[1]; + } + elsif ($_->[0] eq $metric) { + $header = 'recall@' . $metric . $_->[1]; + } + $header; + } @conditions; + + my $scale = ($metric eq $QPS) ? 
1 : 1000; + my $estimations; + for my $condition (@conditions) { + my ($names, $estimate) = estimate($results, @$condition); + if (!defined $estimations) { + @$estimations = map { [$_] } @$names; + } + merge($estimations, $estimate, $scale); + } + + my $fmt = "%-60s" . (" %16s" x @headers) . "\n"; + printf($fmt, '', @headers); + $fmt =~ s/16s/16.4f/g; + for (@$estimations) { + printf($fmt, @$_); + } +} diff --git a/cpp/cuann_bench/scripts/fbin_to_f16bin.py b/cpp/cuann_bench/scripts/fbin_to_f16bin.py new file mode 100755 index 0000000000..4ea8988d87 --- /dev/null +++ b/cpp/cuann_bench/scripts/fbin_to_f16bin.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import sys +import numpy as np + + +def read_fbin(fname): + shape = np.fromfile(fname, dtype=np.uint32, count=2) + if float(shape[0]) * shape[1] * 4 > 2000000000: + data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( + shape + ) + else: + data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + return data + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if len(sys.argv) != 3: + print( + "usage: %s input.fbin output.f16bin" % (sys.argv[0]), + file=sys.stderr, + ) + sys.exit(-1) + +data = read_fbin(sys.argv[1]).astype(np.float16) +write_bin(sys.argv[2], data) diff --git a/cpp/cuann_bench/scripts/hdf5_to_fbin.py b/cpp/cuann_bench/scripts/hdf5_to_fbin.py new file mode 100755 index 0000000000..3ee57b6981 --- /dev/null +++ b/cpp/cuann_bench/scripts/hdf5_to_fbin.py @@ -0,0 +1,84 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +import sys +import numpy as np +import h5py + + +def normalize(x): + norm = np.linalg.norm(x, axis=1) + return (x.T / norm).T + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if len(sys.argv) != 2 and len(sys.argv) != 3: + print( + "usage: %s [-n] .hdf5\n" % (sys.argv[0]), + " -n: normalize base/query set\n", + "outputs: .base.fbin\n", + " .query.fbin\n", + " .groundtruth.neighbors.ibin\n", + " .groundtruth.distances.fbin", + file=sys.stderr, + ) + sys.exit(-1) + +need_normalize = False +if len(sys.argv) == 3: + assert sys.argv[1] == "-n" + need_normalize = True +fname_prefix = sys.argv[-1] +assert fname_prefix.endswith(".hdf5") +fname_prefix = fname_prefix[:-5] + +hdf5 = h5py.File(sys.argv[-1], "r") +assert ( + hdf5.attrs["distance"] == "angular" + or hdf5.attrs["distance"] == "euclidean" +) +assert hdf5["train"].dtype == np.float32 +assert hdf5["test"].dtype == np.float32 +assert hdf5["neighbors"].dtype == np.int32 +assert hdf5["distances"].dtype == np.float32 + +base = hdf5["train"][:] +query = hdf5["test"][:] +if need_normalize: + base = normalize(base) + query = normalize(query) +elif hdf5.attrs["distance"] == "angular": + print( + "warning: input has angular distance, specify -n to normalize base/query set!\n" + ) + +output_fname = fname_prefix + ".base.fbin" +print("writing", output_fname, "...") +write_bin(output_fname, base) + +output_fname = fname_prefix + ".query.fbin" +print("writing", output_fname, "...") +write_bin(output_fname, query) + +output_fname = fname_prefix + ".groundtruth.neighbors.ibin" +print("writing", output_fname, "...") +write_bin(output_fname, hdf5["neighbors"][:]) + +output_fname = fname_prefix + ".groundtruth.distances.fbin" +print("writing", output_fname, "...") +write_bin(output_fname, hdf5["distances"][:]) diff --git a/cpp/cuann_bench/scripts/split_groundtruth.pl b/cpp/cuann_bench/scripts/split_groundtruth.pl new file mode 100755 index 
0000000000..b0a59f806c --- /dev/null +++ b/cpp/cuann_bench/scripts/split_groundtruth.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); + + +@ARGV == 2 + or die "usage: $0 input output_prefix\n"; + +open my $fh, '<:raw', $ARGV[0]; + +my $raw; +read($fh, $raw, 8); +my ($nrows, $dim) = unpack('LL', $raw); + +my $expected_size = 8 + $nrows * $dim * (4 + 4); +my $size = (stat($fh))[7]; +$size == $expected_size + or die("error: expected size is $expected_size, but actual size is $size\n"); + + +open my $fh_out1, '>:raw', "$ARGV[1].neighbors.ibin"; +open my $fh_out2, '>:raw', "$ARGV[1].distances.fbin"; + +print {$fh_out1} $raw; +print {$fh_out2} $raw; + +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out1} $raw; +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out2} $raw; diff --git a/cpp/cuann_bench/src/ann.h b/cpp/cuann_bench/src/ann.h new file mode 100644 index 0000000000..fae1fe3977 --- /dev/null +++ b/cpp/cuann_bench/src/ann.h @@ -0,0 +1,89 @@ +#ifndef ANN_H_ +#define ANN_H_ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cuann { + +enum class Metric { + kInnerProduct, + kEuclidean, +}; + +enum class MemoryType { + Host, + HostMmap, + Device, +}; + +struct AlgoProperty { + MemoryType dataset_memory_type; + // neighbors/distances should have same memory type as queries + MemoryType query_memory_type; + bool need_dataset_when_search; +}; + +template +class ANN { + public: + struct AnnSearchParam { + virtual ~AnnSearchParam() = default; + }; + + ANN(Metric metric, int dim) : metric_(metric), dim_(dim) {} + virtual ~ANN() = default; + + virtual void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) = 0; + + virtual void set_search_param(const AnnSearchParam& param) = 0; + // TODO: this assumes that an algorithm can always return k results. + // This is not always possible. + virtual void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const = 0; + + virtual void save(const std::string& file) const = 0; + virtual void load(const std::string& file) = 0; + + virtual AlgoProperty get_property() const = 0; + + // Some algorithms don't save the building dataset in their indices. + // So they should be given the access to that dataset during searching. + // The advantage of this way is that index has smaller size + // and many indices can share one dataset. + // + // AlgoProperty::need_dataset_when_search of such algorithm should be true, + // and set_search_dataset() should save the passed-in pointer somewhere. 
+ // The client code should call set_search_dataset() before searching, + // and should not release dataset before searching is finished. + virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){}; + + protected: + Metric metric_; + int dim_; +}; + +} // namespace cuann + +#endif // ANN_H_ diff --git a/cpp/cuann_bench/src/benchmark.cu b/cpp/cuann_bench/src/benchmark.cu new file mode 100644 index 0000000000..f71650f383 --- /dev/null +++ b/cpp/cuann_bench/src/benchmark.cu @@ -0,0 +1,555 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifdef NVTX +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dataset.h" +#include "factory.cuh" +#include "util.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::string; +using std::to_string; +using std::unordered_set; +using std::vector; +using namespace benchmark; +using cuann::MemoryType; + +// supported types: float, half (very few implementations support it), uint8_t, int8_t +using data_t = float; + +bool check_file_exist(const vector& files) +{ + bool ret = true; + unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) == processed.end() && !file_exists(file)) { + log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +bool check_file_not_exist(const vector& files, bool force_overwrite) +{ + bool ret = true; + for (const auto& file : files) { + if (file_exists(file)) { + if (force_overwrite) { + log_warn("'%s' already exists, will overwrite it", file.c_str()); + } else { + log_error("'%s' already exists, use '-f' to force overwriting", file.c_str()); + ret = false; + } + } + } + return ret; +} + +bool check_no_duplicate_file(const vector& files) +{ + bool ret = true; + unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) != processed.end()) { + log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +bool mkdir(const vector& dirs) +{ + unordered_set processed; + for (const auto& dir : dirs) { + if (processed.find(dir) == processed.end() && !dir_exists(dir)) { + if (create_dir(dir)) { + log_info("mkdir '%s'", dir.c_str()); + } else { + log_error("fail to create output directory '%s'", dir.c_str()); + // won't create any other dir when problem occurs + 
return false; + } + } + processed.insert(dir); + } + return true; +} + +bool check(const vector& indices, bool build_mode, bool force_overwrite) +{ + vector files_should_exist; + vector dirs_should_exist; + vector output_files; + for (const auto& index : indices) { + if (build_mode) { + output_files.push_back(index.file); + output_files.push_back(index.file + ".txt"); + + auto pos = index.file.rfind('/'); + if (pos != string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } + } else { + files_should_exist.push_back(index.file); + files_should_exist.push_back(index.file + ".txt"); + + output_files.push_back(index.search_result_file + ".0.ibin"); + output_files.push_back(index.search_result_file + ".0.txt"); + + auto pos = index.search_result_file.rfind('/'); + if (pos != string::npos) { + dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); + } + } + } + + bool ret = true; + if (!check_file_exist(files_should_exist)) { ret = false; } + if (!check_file_not_exist(output_files, force_overwrite)) { ret = false; } + if (!check_no_duplicate_file(output_files)) { ret = false; } + if (ret && !mkdir(dirs_should_exist)) { ret = false; } + return ret; +} + +void write_build_info(const string& file_prefix, + const string& dataset, + const string& distance, + const string& name, + const string& algo, + const string& build_param, + float build_time) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "build_time: " << build_time << endl; + ofs.close(); + if (!ofs) { throw std::runtime_error("can't write to build info file: " + file_prefix + ".txt"); } +} + +template +void build(const benchmark::Dataset* dataset, const vector& indices) +{ + cudaStream_t stream; + 
ANN_CUDA_CHECK(cudaStreamCreate(&stream)); + + log_info( + "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + + log_info("building index '%s'", index.name.c_str()); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); +#ifdef NVTX + nvtxRangePush("build"); +#endif + Timer timer; + algo->build(base_set_ptr, dataset->base_set_size(), stream); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); + ANN_CUDA_CHECK_LAST_ERROR(); + + algo->save(index.file); + write_build_info(index.file, + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + elapsed_ms / 1000.0f); + log_info("saved index to %s", index.file.c_str()); + } + + ANN_CUDA_CHECK(cudaStreamDestroy(stream)); +} + +void write_search_result(const string& file_prefix, + const string& dataset, + const string& distance, + const string& name, + const string& algo, + const string& build_param, + const string& search_param, + int batch_size, + int run_count, + int k, + 
float search_time_average, + float search_time_p99, + float search_time_p999, + const int* neighbors, + size_t query_set_size) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "search_param: " << search_param << "\n" + << "\n" + << "batch_size: " << batch_size << "\n" + << "run_count: " << run_count << "\n" + << "k: " << k << "\n" + << "average_search_time: " << search_time_average << endl; + if (search_time_p99 != std::numeric_limits::max()) { + ofs << "p99_search_time: " << search_time_p99 << endl; + } + if (search_time_p999 != std::numeric_limits::max()) { + ofs << "p999_search_time: " << search_time_p999 << endl; + } + ofs.close(); + if (!ofs) { + throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); + } + + BinFile neighbors_file(file_prefix + ".ibin", "w"); + neighbors_file.write(neighbors, query_set_size, k); +} + +template +void search(const benchmark::Dataset* dataset, const vector& indices) +{ + if (indices.empty()) { return; } + cudaStream_t stream; + ANN_CUDA_CHECK(cudaStreamCreate(&stream)); + + log_info("loading query set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->query_set_size()); + const T* query_set = dataset->query_set(); + // query set is usually much smaller than base set, so load it eagerly + const T* d_query_set = dataset->query_set_on_gpu(); + size_t query_set_size = dataset->query_set_size(); + + // currently all indices has same batch_size, k and run_count + const int batch_size = indices[0].batch_size; + const int k = indices[0].k; + const int run_count = indices[0].run_count; + log_info( + "basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); + 
if (query_set_size % batch_size != 0) { + log_warn("query set size (%zu) %% batch size (%d) != 0, the size of last batch is %zu", + query_set_size, + batch_size, + query_set_size % batch_size); + } + const size_t num_batches = (query_set_size - 1) / batch_size + 1; + size_t* neighbors = new size_t[query_set_size * k]; + int* neighbors_buf = new int[query_set_size * k]; + float* distances = new float[query_set_size * k]; + vector search_times; + search_times.reserve(num_batches); + size_t* d_neighbors; + float* d_distances; + ANN_CUDA_CHECK(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); + ANN_CUDA_CHECK(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); + algo->load(index.file); + + const T* this_query_set = query_set; + size_t* this_neighbors = neighbors; + float* this_distances = distances; + if (algo_property.query_memory_type == MemoryType::Device) { + this_query_set = d_query_set; + this_neighbors = d_neighbors; + this_distances = d_distances; + } + + if (algo_property.need_dataset_when_search) { + log_info("loading base set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->base_set_size()); + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if
(algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + algo->set_search_dataset(base_set_ptr, dataset->base_set_size()); + } + + for (int i = 0, end_i = index.search_params.size(); i != end_i; ++i) { + auto p_param = create_search_param(index.algo, index.search_params[i]); + algo->set_search_param(*p_param); + log_info("search with param: %s", index.search_params[i].dump().c_str()); + + if (algo_property.query_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaMemset(d_neighbors, 0, query_set_size * k * sizeof(*d_neighbors))); + ANN_CUDA_CHECK(cudaMemset(d_distances, 0, query_set_size * k * sizeof(*d_distances))); + } else { + memset(neighbors, 0, query_set_size * k * sizeof(*neighbors)); + memset(distances, 0, query_set_size * k * sizeof(*distances)); + } + + float best_search_time_average = std::numeric_limits::max(); + float best_search_time_p99 = std::numeric_limits::max(); + float best_search_time_p999 = std::numeric_limits::max(); + for (int run = 0; run < run_count; ++run) { + log_info("run %d / %d", run + 1, run_count); + for (size_t batch_id = 0; batch_id < num_batches; ++batch_id) { + size_t row = batch_id * batch_size; + int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); +#ifdef NVTX + string nvtx_label = "batch" + to_string(batch_id); + if (run_count != 1) { nvtx_label = "run" + to_string(run) + "-" + nvtx_label; } + if (batch_id == 10) { + run = run_count - 1; + break; + } +#endif + Timer timer; +#ifdef NVTX + nvtxRangePush(nvtx_label.c_str()); +#endif + algo->search(this_query_set + row * dataset->dim(), + actual_batch_size, + k, + this_neighbors + row * k, + this_distances + row * k, + stream); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + // If the size of the last batch is less than batch_size, don't count it for + // search time. But neighbors of the last batch will still be filled, so it's + // counted for recall calculation. + if (actual_batch_size == batch_size) { + search_times.push_back(elapsed_ms / 1000.0f); // in seconds + } + } + + float search_time_average = + std::accumulate(search_times.cbegin(), search_times.cend(), 0.0f) / search_times.size(); + best_search_time_average = std::min(best_search_time_average, search_time_average); + + if (search_times.size() >= 100) { + std::sort(search_times.begin(), search_times.end()); + + auto calc_percentile_pos = [](float percentile, size_t N) { + return static_cast(std::ceil(percentile / 100.0 * N)) - 1; + }; + + float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; + best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); + + if (search_times.size() >= 1000) { + float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; + best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); + } + } + search_times.clear(); + } + ANN_CUDA_CHECK_LAST_ERROR(); + + if (algo_property.query_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaMemcpy(neighbors, + d_neighbors, + query_set_size * k * sizeof(*d_neighbors), + 
cudaMemcpyDeviceToHost)); + ANN_CUDA_CHECK(cudaMemcpy(distances, + d_distances, + query_set_size * k * sizeof(*d_distances), + cudaMemcpyDeviceToHost)); + } + + for (size_t j = 0; j < query_set_size * k; ++j) { + neighbors_buf[j] = neighbors[j]; + } + write_search_result(index.search_result_file + "." + to_string(i), + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + index.search_params[i].dump(), + batch_size, + index.run_count, + k, + best_search_time_average, + best_search_time_p99, + best_search_time_p999, + neighbors_buf, + query_set_size); + } + + log_info("finish searching for index '%s'", index.name.c_str()); + } + + delete[] neighbors; + delete[] neighbors_buf; + delete[] distances; + ANN_CUDA_CHECK(cudaFree(d_neighbors)); + ANN_CUDA_CHECK(cudaFree(d_distances)); + ANN_CUDA_CHECK(cudaStreamDestroy(stream)); +} + +const string usage(const string& argv0) +{ + return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + + " -b: build mode, will build index\n" + + " -s: search mode, will search using built index\n" + + " one and only one of -b and -s should be specified\n" + + " -c: just check command line options and conf.json are sensible\n" + + " won't build or search\n" + " -f: force overwriting existing output files\n" + + " -i: by default will build/search all the indices found in conf.json\n" + + " '-i' can be used to select a subset of indices\n" + + " 'index_names' is a list of comma-separated index names\n" + + " '*' is allowed as the last character of a name to select all matched indices\n" + + " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; +} + +int main(int argc, char** argv) +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + bool only_check = false; + string index_patterns("*"); + + int opt; + while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { + switch (opt) { + case 'b': build_mode = true; break; + case 's': search_mode = 
true; break; + case 'c': only_check = true; break; + case 'f': force_overwrite = true; break; + case 'i': index_patterns = optarg; break; + case 'h': cout << usage(argv[0]) << endl; return -1; + default: cerr << "\n" << usage(argv[0]) << endl; return -1; + } + } + if (build_mode == search_mode) { + cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; + return -1; + } + if (argc - optind != 1) { + cerr << usage(argv[0]) << endl; + return -1; + } + string conf_file = argv[optind]; + + std::ifstream conf_stream(conf_file.c_str()); + if (!conf_stream) { + log_error("can't open configuration file: %s", argv[optind]); + return -1; + } + + try { + Configuration conf(conf_stream); + + auto dataset_conf = conf.get_dataset_conf(); + BinDataset dataset(dataset_conf.name, + dataset_conf.base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + dataset_conf.query_file, + dataset_conf.distance); + + vector indices = conf.get_indices(index_patterns); + if (!check(indices, build_mode, force_overwrite)) { return -1; } + + string message = "will "; + message += build_mode ? "build:" : "search:"; + for (const auto& index : indices) { + message += "\n " + index.name; + } + log_info("%s", message.c_str()); + + if (only_check) { + log_info("%s", "all check passed, quit due to option -c"); + return 0; + } + + if (build_mode) { + build(&dataset, indices); + } else if (search_mode) { + search(&dataset, indices); + } + } catch (const std::exception& e) { + log_error("exception occurs: %s", e.what()); + return -1; + } +} diff --git a/cpp/cuann_bench/src/conf.cpp b/cpp/cuann_bench/src/conf.cpp new file mode 100644 index 0000000000..90b164c076 --- /dev/null +++ b/cpp/cuann_bench/src/conf.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "conf.h" + +#include +#include +#include +#include +#include + +#include "util.h" + +namespace benchmark { +using std::runtime_error; +using std::string; +using std::unordered_set; +using std::vector; + +Configuration::Configuration(std::istream& conf_stream) +{ + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset_(conf.at("dataset")); + parse_index_(conf.at("index"), conf.at("search_basic_param")); +} + +vector Configuration::get_indices(const string& patterns) const +{ + vector names; + for (const auto& index : indices_) { + names.push_back(index.name); + } + + auto matched = match_(names, patterns); + if (matched.empty()) { throw runtime_error("no available index matches '" + patterns + "'"); } + + vector res; + for (const auto& index : indices_) { + if (matched.find(index.name) != matched.end()) { res.push_back(index); } + } + return res; +} + +void Configuration::parse_dataset_(const nlohmann::json& conf) +{ + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } +} + +void Configuration::parse_index_(const nlohmann::json& index_conf, + const nlohmann::json& search_basic_conf) +{ + const int batch_size = 
search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + const int run_count = search_basic_conf.at("run_count"); + + for (const auto& conf : index_conf) { + Index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + index.run_count = run_count; + + if (conf.contains("multigpu")) { + for (auto it : conf.at("multigpu")) { + index.dev_list.push_back(it); + } + if (index.dev_list.empty()) { throw std::runtime_error("dev_list shouln't be empty!"); } + index.dev_list.shrink_to_fit(); + index.build_param["multigpu"] = conf["multigpu"]; + } + + if (conf.contains("refine_ratio")) { + float refine_ratio = conf.at("refine_ratio"); + if (refine_ratio <= 1.0f) { + throw runtime_error("'" + index.name + "': refine_ratio should > 1.0"); + } + index.refine_ratio = refine_ratio; + } + + for (const auto& param : conf.at("search_params")) { + index.search_params.push_back(param); + } + index.search_result_file = conf.at("search_result_file"); + + indices_.push_back(index); + } +} + +unordered_set Configuration::match_(const vector& candidates, + const string& patterns) const +{ + unordered_set matched; + for (const auto& pat : split(patterns, ',')) { + if (pat.empty()) { continue; } + + if (pat.back() == '*') { + auto len = pat.size() - 1; + for (const auto& item : candidates) { + if (item.compare(0, len, pat, 0, len) == 0) { matched.insert(item); } + } + } else { + for (const auto& item : candidates) { + if (item == pat) { matched.insert(item); } + } + } + } + + return matched; +} + +} // namespace benchmark diff --git a/cpp/cuann_bench/src/conf.h b/cpp/cuann_bench/src/conf.h new file mode 100644 index 0000000000..987f2d52aa --- /dev/null +++ b/cpp/cuann_bench/src/conf.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CONF_H_ +#define CONF_H_ + +#include +#include +#include +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace benchmark { + +class Configuration { + public: + struct Index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + std::vector dev_list; + + int batch_size; + int k; + int run_count; + std::vector search_params; + std::string search_result_file; + float refine_ratio{0.0f}; + }; + + struct DatasetConf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + }; + + Configuration(std::istream& conf_stream); + + DatasetConf get_dataset_conf() const { return dataset_conf_; } + std::vector get_indices(const std::string& patterns) const; + + private: + void parse_dataset_(const nlohmann::json& conf); + void parse_index_(const nlohmann::json& index_conf, const nlohmann::json& search_basic_conf); + std::unordered_set match_(const std::vector& candidates, + const std::string& patterns) const; + + DatasetConf dataset_conf_; + std::vector indices_; +}; + +} // namespace benchmark + +#endif diff --git 
a/cpp/cuann_bench/src/cudart_util.h b/cpp/cuann_bench/src/cudart_util.h new file mode 100644 index 0000000000..1d315ad532 --- /dev/null +++ b/cpp/cuann_bench/src/cudart_util.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CUDART_UTIL_H_ +#define CUDART_UTIL_H_ +#include +#include + +#include + +#define ANN_CUDA_CHECK(call) \ + { \ + cuann::cuda_check_((call), __FILE__, __LINE__); \ + } + +#ifndef NDEBUG +#define ANN_CUDA_CHECK_LAST_ERROR() \ + { \ + cuann::cuda_check_last_error_(__FILE__, __LINE__); \ + } +#else +#define ANN_CUDA_CHECK_LAST_ERROR() +#endif + +namespace cuann { + +constexpr unsigned int WARP_FULL_MASK = 0xffffffff; +constexpr int WARP_SIZE = 32; + +class CudaException : public std::runtime_error { + public: + explicit CudaException(const std::string& what) : runtime_error(what) {} +}; + +inline void cuda_check_(cudaError_t val, const char* file, int line) +{ + if (val != cudaSuccess) { + throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + + std::to_string(val) + ": " + cudaGetErrorName(val) + ": " + + cudaGetErrorString(val)); + } +} + +inline void cuda_check_last_error_(const char* file, int line) +{ + cudaDeviceSynchronize(); + cudaError_t err = cudaPeekAtLastError(); + cuda_check_(err, file, line); +} + +} // namespace cuann +#endif diff --git a/cpp/cuann_bench/src/dataset.h b/cpp/cuann_bench/src/dataset.h new file mode 
100644 index 0000000000..b756b204d4 --- /dev/null +++ b/cpp/cuann_bench/src/dataset.h @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_H_ +#define DATASET_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "cudart_util.h" + +namespace benchmark { + +// http://big-ann-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
+template +class BinFile { + public: + BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~BinFile() { fclose(fp_); } + BinFile(const BinFile&) = delete; + BinFile& operator=(const BinFile&) = delete; + + void get_shape(size_t* nrows, int* ndims) + { + assert(read_mode_); + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fread() BinFile " + file_ + " failed"); + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + } + + void* map() const + { + assert(read_mode_); + int fid = fileno(fp_); + auto mmap_ptr = mmap(NULL, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mmap_ptr == MAP_FAILED) { + throw std::runtime_error("mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))); + } + return mmap_ptr; + } + + void unmap(void* data) const + { + if (munmap(data, file_size_) == -1) { + throw std::runtime_error("munmap error: " + std::string(strerror(errno))); + } + } + + private: + void check_suffix_(); + + std::string file_; + FILE* fp_; + bool read_mode_; + uint32_t nrows_; + uint32_t ndims_; + size_t file_size_; +}; + +template +BinFile::BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(file) +{ + check_suffix_(); + + if (mode == "r") { + read_mode_ = 
true; + } else if (mode == "w") { + read_mode_ = false; + if (subset_first_row != 0) { + throw std::runtime_error("subset_first_row should be zero for write mode"); + } + if (subset_size != 0) { throw std::runtime_error("subset_size should be zero for write mode"); } + } else { + throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); + } + + fp_ = fopen(file_.c_str(), mode.c_str()); + if (!fp_) { throw std::runtime_error("open BinFile failed: " + file_); } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error("stat() failed: " + file_); } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error("read header of BinFile failed: " + file_); + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error("expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)); + } + + if (subset_first_row >= nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") >= nrows (" + std::to_string(nrows_) + ")"); + } + if (subset_first_row + subset_size > nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") + subset_size (" + std::to_string(subset_size) + ") > nrows (" + + std::to_string(nrows_) + ")"); + } + + if (subset_first_row) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error(file_ + ": fseek failed"); + } + nrows_ -= subset_first_row; + } + if (subset_size) { nrows_ = subset_size; } + } +} + +template +void BinFile::check_suffix_() +{ + auto pos = 
file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error("name of BinFile doesn't have a suffix: " + file_); + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error("BinFile should has .fbin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin") { + throw std::runtime_error("BinFile should has .f16bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error("BinFile should has .ibin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error("BinFile should has .u8bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error("BinFile should has .i8bin suffix: " + file_); + } + } else { + throw std::runtime_error( + "T of BinFile should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class Dataset { + public: + Dataset(const std::string& name) : name_(name) {} + Dataset(const std::string& name, const std::string& distance) : name_(name), distance_(distance) + { + } + Dataset(const Dataset&) = delete; + Dataset& operator=(const Dataset&) = delete; + virtual ~Dataset(); + + std::string name() const { return name_; } + std::string distance() const { return distance_; } + int dim() const { return dim_; } + size_t base_set_size() const { return base_set_size_; } + size_t query_set_size() const { return query_set_size_; } + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + const T* base_set() const + { + if (!base_set_) { load_base_set_(); } + return base_set_; + } + + const T* query_set() const + { + if (!query_set_) { load_query_set_(); } + return query_set_; + } + + const T* base_set_on_gpu() const; + const T* query_set_on_gpu() const; + const T* mapped_base_set() const; + + protected: + virtual void load_base_set_() const = 0; + virtual void load_query_set_() const = 0; + virtual void map_base_set_() const = 0; + + std::string name_; + std::string distance_; + int dim_; + size_t base_set_size_; + size_t query_set_size_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; +}; + +template +Dataset::~Dataset() +{ + delete[] base_set_; + delete[] query_set_; + if (d_base_set_) { ANN_CUDA_CHECK(cudaFree(d_base_set_)); } + if (d_query_set_) { ANN_CUDA_CHECK(cudaFree(d_query_set_)); } +} + +template +const T* Dataset::base_set_on_gpu() const +{ + if (!d_base_set_) { + base_set(); + ANN_CUDA_CHECK(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); + ANN_CUDA_CHECK(cudaMemcpy( + d_base_set_, base_set_, base_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_base_set_; +} + +template +const T* Dataset::query_set_on_gpu() const +{ + if (!d_query_set_) { + query_set(); + ANN_CUDA_CHECK(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); + ANN_CUDA_CHECK(cudaMemcpy( + d_query_set_, query_set_, query_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_query_set_; +} + +template +const T* Dataset::mapped_base_set() const +{ + if (!mapped_base_set_) { map_base_set_(); } + return mapped_base_set_; +} + +template +class BinDataset : public Dataset { + public: + BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& 
query_file, + const std::string& distance); + ~BinDataset() + { + if (this->mapped_base_set_) { + base_file_.unmap(reinterpret_cast(this->mapped_base_set_) - subset_offset_); + } + } + + private: + void load_base_set_() const override; + void load_query_set_() const override; + void map_base_set_() const override; + + using Dataset::dim_; + using Dataset::base_set_size_; + using Dataset::query_set_size_; + + BinFile base_file_; + BinFile query_file_; + + size_t subset_offset_; +}; + +template +BinDataset::BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance) + : Dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r"), + subset_offset_(2 * sizeof(uint32_t) + subset_first_row * dim_ * sizeof(T)) +{ + base_file_.get_shape(&base_set_size_, &dim_); + int query_dim; + query_file_.get_shape(&query_set_size_, &query_dim); + if (query_dim != dim_) { + throw std::runtime_error("base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(query_dim)); + } +} + +template +void BinDataset::load_base_set_() const +{ + this->base_set_ = new T[base_set_size_ * dim_]; + base_file_.read(this->base_set_); +} + +template +void BinDataset::load_query_set_() const +{ + this->query_set_ = new T[query_set_size_ * dim_]; + query_file_.read(this->query_set_); +} + +template +void BinDataset::map_base_set_() const +{ + char* original_map_ptr = static_cast(base_file_.map()); + this->mapped_base_set_ = reinterpret_cast(original_map_ptr + subset_offset_); +} + +} // namespace benchmark + +#endif diff --git a/cpp/cuann_bench/src/factory.cuh b/cpp/cuann_bench/src/factory.cuh new file mode 100644 index 0000000000..f708d2d4d8 --- /dev/null +++ b/cpp/cuann_bench/src/factory.cuh @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef FACTORY_H_ +#define FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#undef WARP_SIZE +#ifdef RAFT_CUANN_BENCH_USE_FAISS +#include "faiss_wrapper.h" +#endif +#ifdef RAFT_CUANN_BENCH_USE_GGNN +#include "ggnn_wrapper.cuh" +#endif +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#include "hnswlib_wrapper.h" +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class cuann::RaftIvfFlatGpu; +extern template class cuann::RaftIvfFlatGpu; +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +#include "raft_ivf_pq_wrapper.h" +extern template class cuann::RaftIvfPQ; +extern template class cuann::RaftIvfPQ; +#endif +#ifdef RAFT_CUANN_BENCH_USE_MULTI_GPU +#include "multigpu.cuh" +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace benchmark { + +cuann::Metric parse_metric(const std::string& metric_str) +{ + if (metric_str == "inner_product") { + return cuann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return cuann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} + +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +template +void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); 
+ param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, typename cuann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} +#endif + +#ifdef RAFT_CUANN_BENCH_USE_FAISS +template +void parse_build_param(const nlohmann::json& conf, + typename cuann::FaissGpuIVFFlat::BuildParam& param) +{ + param.nlist = conf.at("nlist"); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuann::FaissGpuIVFPQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename cuann::FaissGpuIVFSQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, typename cuann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_CUANN_BENCH_USE_GGNN +template +void parse_build_param(const nlohmann::json& conf, typename cuann::Ggnn::BuildParam& param) +{ + param.dataset_size = conf.at("dataset_size"); + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void 
parse_search_param(const nlohmann::json& conf, typename cuann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} +#endif + +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename cuann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); + std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +template +void parse_build_param(const nlohmann::json& conf, + typename cuann::RaftIvfPQ::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuann::RaftIvfPQ::SearchParam& param) +{ + param.pq_param.n_probes = conf.at("numProbes"); + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == 
"half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } +} +#endif + +template class Algo> +std::unique_ptr> make_algo(cuann::Metric metric, int dim, const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(cuann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + +#ifdef RAFT_CUANN_BENCH_USE_MULTI_GPU + if (dev_list.empty()) { + return std::make_unique>(metric, dim, param); + } else { + return std::make_unique>>(metric, dim, param, dev_list); + } +#else + (void)dev_list; + return std::make_unique>(metric, dim, param); +#endif +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; +#ifndef RAFT_CUANN_BENCH_USE_MULTI_GPU + if (!dev_list.empty()) { + throw std::runtime_error( + "compiled without RAFT_CUANN_BENCH_USE_MULTI_GPU, but a device list is given"); + } 
+#endif + + cuann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } +#endif +#ifdef RAFT_CUANN_BENCH_USE_FAISS + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) { +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } +#endif + } + +#ifdef RAFT_CUANN_BENCH_USE_GGNN + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename cuann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename cuann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param, refine_ratio); + } +#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_FAISS + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { + auto param 
= std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_GGNN + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace benchmark +#endif diff --git a/cpp/cuann_bench/src/faiss_wrapper.h b/cpp/cuann_bench/src/faiss_wrapper.h new file mode 100644 index 0000000000..803808e29c --- /dev/null +++ b/cpp/cuann_bench/src/faiss_wrapper.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FAISS_WRAPPER_H_ +#define FAISS_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ann.h" + +namespace { + +faiss::MetricType parse_metric_type(cuann::Metric metric) +{ + if (metric == cuann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == cuann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class OmpSingleThreadScope { + public: + OmpSingleThreadScope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~OmpSingleThreadScope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace cuann { + +template +class FaissGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + }; + + FaissGpu(Metric metric, int dim, int nlist); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_property() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + + protected: + template 
+ void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + mutable faiss::gpu::StandardGpuResources gpu_resource_; + std::unique_ptr index_; + faiss::MetricType metric_type_; + int nlist_; + int device_; +}; + +template +FaissGpu::FaissGpu(Metric metric, int dim, int nlist) + : ANN(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist) +{ + static_assert(std::is_same_v, "faiss support only float type"); + ANN_CUDA_CHECK(cudaGetDevice(&device_)); +} + +template +void FaissGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + OmpSingleThreadScope omp_single_thread; + + gpu_resource_.setDefaultStream(device_, stream); + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); +} + +template +void FaissGpu::set_search_param(const AnnSearchParam& param) +{ + int nprobe = dynamic_cast(param).nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->setNumProbes(nprobe); +} + +template +void FaissGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::Index::idx_t), + "sizes of size_t and faiss::Index::idx_t are different"); + gpu_resource_.setDefaultStream(device_, stream); + index_->search( + batch_size, queries, k, distances, reinterpret_cast(neighbors)); +} + +template +template +void FaissGpu::save_(const std::string& file) const +{ + OmpSingleThreadScope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void FaissGpu::load_(const std::string& file) +{ + OmpSingleThreadScope omp_single_thread; + + std::unique_ptr cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + 
dynamic_cast(index_.get())->copyFrom(cpu_index.get()); +} + +template +class FaissGpuIVFFlat : public FaissGpu { + public: + struct BuildParam { + int nlist; + }; + + FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFPQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + int M; + bool useFloat16; + bool usePrecomputed; + }; + + FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.useFloat16; + config.usePrecomputedTables = param.usePrecomputed; + config.device = this->device_; + this->index_ = + std::make_unique(&(this->gpu_resource_), + dim, + param.nlist, + param.M, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFSQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + std::string quantizer_type; + }; + + FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("FaissGpuIVFSQ supports only fp16 and int8 but got " + + 
param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } +}; + +template +class FaissGpuFlat : public FaissGpu { + public: + FaissGpuFlat(Metric metric, int dim) : FaissGpu(metric, dim, 0) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, this->metric_type_, config); + } + + // class FaissGpu is more like a IVF class, so need special treating here + void set_search_param(const typename ANN::AnnSearchParam&) override{}; + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/ggnn_wrapper.cuh b/cpp/cuann_bench/src/ggnn_wrapper.cuh new file mode 100644 index 0000000000..d0ae3d3c1b --- /dev/null +++ b/cpp/cuann_bench/src/ggnn_wrapper.cuh @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef GGNN_WRAPPER_H_ +#define GGNN_WRAPPER_H_ + +#include +#include + +#include "ann.h" +#include "cudart_util.h" +#include + +namespace cuann { + +template +class GgnnImpl; + +template +class Ggnn : public ANN { + public: + struct BuildParam { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + + size_t dataset_size; + int k; // GGNN requires to know k during building + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + }; + + Ggnn(Metric metric, int dim, const BuildParam& param); + ~Ggnn() { delete impl_; } + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override + { + impl_->build(dataset, nrow, stream); + } + + void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override + { + impl_->search(queries, batch_size, k, neighbors, distances, stream); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + + AlgoProperty get_property() const override { return impl_->get_property(); } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + ANN* impl_; +}; + +template +Ggnn::Ggnn(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 && param.k_build == 24 
&& param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class GgnnImpl : public ANN { + public: + using typename ANN::AnnSearchParam; + + GgnnImpl(Metric metric, int dim, const typename Ggnn::BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& file) const override; + void load(const std::string& file) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using ANN::metric_; + using ANN::dim_; + + using GGNNGPUInstance = GGNNGPUInstance; + std::unique_ptr ggnn_; + typename Ggnn::BuildParam build_param_; + typename Ggnn::SearchParam search_param_; +}; + +template +GgnnImpl::GgnnImpl(Metric metric, + 
int dim, + const typename Ggnn::BuildParam& param) + : ANN(metric, dim), build_param_(param) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched dim"); } + + int device; + ANN_CUDA_CHECK(cudaGetDevice(&device)); + + ggnn_ = std::make_unique( + device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau); +} + +template +void GgnnImpl::build(const T* dataset, + size_t nrow, + cudaStream_t stream) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + + ggnn_->set_base_data(dataset); + ggnn_->set_stream(stream); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void GgnnImpl::set_search_dataset(const T* dataset, size_t nrow) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + ggnn_->set_base_data(dataset); +} + +template +void GgnnImpl::set_search_param(const AnnSearchParam& param) +{ + search_param_ = dynamic_cast::SearchParam&>(param); +} + +template +void GgnnImpl::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this GGNN instance only supports k = " + 
std::to_string(KQuery)); + } + + ggnn_->set_stream(stream); + ANN_CUDA_CHECK(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size == 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void GgnnImpl::save(const std::string& file) const +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + 
auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.downloadAsync(ggnn_device); + ANN_CUDA_CHECK(cudaStreamSynchronize(ggnn_device.stream)); + ggnn_host.store(file); +} + +template +void GgnnImpl::load(const std::string& file) +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.load(file); + ggnn_host.uploadAsync(ggnn_device); + ANN_CUDA_CHECK(cudaStreamSynchronize(ggnn_device.stream)); +} + +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/hnswlib_wrapper.h b/cpp/cuann_bench/src/hnswlib_wrapper.h new file mode 100644 index 0000000000..c2241253a5 --- /dev/null +++ b/cpp/cuann_bench/src/hnswlib_wrapper.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef HNSWLIB_WRAPPER_H_ +#define HNSWLIB_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#include + +namespace cuann { + +namespace { +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +class FixedThreadPool { + public: + FixedThreadPool(int num_threads) + { + if (num_threads < 1) { + throw std::runtime_error("num_threads must >= 1"); + } else if (num_threads == 1) { + return; + } + + tasks_ = new Task_[num_threads]; + + threads_.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + threads_.emplace_back([&, i] { + auto& task = tasks_[i]; + while (true) { + std::unique_lock lock(task.mtx); + task.cv.wait(lock, + [&] { return task.has_task || finished_.load(std::memory_order_relaxed); }); + if (finished_.load(std::memory_order_relaxed)) { break; } + + task.task(); + task.has_task = false; + } + }); + } + } + + ~FixedThreadPool() + { + if (threads_.empty()) { return; } + + finished_.store(true, std::memory_order_relaxed); + for (unsigned i = 0; i < threads_.size(); ++i) { + auto& task = tasks_[i]; + std::lock_guard(task.mtx); + + task.cv.notify_one(); + threads_[i].join(); + } + + delete[] tasks_; + } + + template + void submit(Func f, IdxT len) + { + if (threads_.empty()) { + for (IdxT i = 0; i < len; ++i) { + f(i); + } + return; + } + + const int num_threads = threads_.size(); + // one extra part for competition among threads + const IdxT items_per_thread = len / (num_threads + 1); + std::atomic cnt(items_per_thread * num_threads); + + auto wrapped_f = [&](IdxT start, IdxT end) { + for (IdxT i = start; i < end; ++i) { + f(i); + } + + while (true) { + IdxT i = cnt.fetch_add(1, std::memory_order_relaxed); + if (i >= len) { break; } + f(i); + } + }; + + std::vector> futures; + 
futures.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + IdxT start = i * items_per_thread; + auto& task = tasks_[i]; + { + std::lock_guard lock(task.mtx); + (void)lock; // stop nvcc warning + task.task = std::packaged_task([=] { wrapped_f(start, start + items_per_thread); }); + futures.push_back(task.task.get_future()); + task.has_task = true; + } + task.cv.notify_one(); + } + + for (auto& fut : futures) { + fut.wait(); + } + return; + } + + private: + struct alignas(64) Task_ { + std::mutex mtx; + std::condition_variable cv; + bool has_task = false; + std::packaged_task task; + }; + + Task_* tasks_; + std::vector threads_; + std::atomic finished_{false}; +}; + +} // namespace + +template +class HnswLib : public ANN { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct BuildParam { + int M; + int ef_construction; + int num_threads{1}; + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads{1}; + }; + + HnswLib(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* query, + int batch_size, + int k, + size_t* indices, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + property.need_dataset_when_search = false; + return property; + } + + private: + void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; + + std::unique_ptr::type>> appr_alg_; + std::unique_ptr::type>> space_; + + using ANN::metric_; + using ANN::dim_; + int ef_construction_; + 
int m_; + int num_threads_; + std::unique_ptr thread_pool_; +}; + +template +HnswLib::HnswLib(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.M; + num_threads_ = param.num_threads; +} + +template +void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_unique(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) { + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void HnswLib::set_search_param(const AnnSearchParam& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + + if (!thread_pool_ || num_threads_ != param.num_threads) { + num_threads_ = param.num_threads; + thread_pool_ = std::make_unique(num_threads_); + } +} + +template +void HnswLib::search( + const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const +{ + thread_pool_->submit( + [&](int i) { + get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }, + batch_size); +} + +template +void 
HnswLib::save(const std::string& path_to_index) const +{ + appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void HnswLib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), path_to_index); +} + +template +void HnswLib::get_search_knn_results_(const T* query, + int k, + size_t* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/multigpu.cuh b/cpp/cuann_bench/src/multigpu.cuh new file mode 100644 index 0000000000..0061298436 --- /dev/null +++ b/cpp/cuann_bench/src/multigpu.cuh @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MULTIGPU_H_ +#define MULTIGPU_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#include "cudart_util.h" + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +namespace { + +__global__ void add_index_offset_kernel(size_t* arr, size_t len, size_t offset) +{ + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len) arr[id] += offset; +} + +template +__global__ void reset_search_data_placement_kernel( + T* arr, T* from, int len, int k, int batch_size, int dev_cnt) +{ + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + size_t dev = id / (k * batch_size); + size_t batch_id = id % (k * batch_size) / k; + size_t to_id = batch_id * k * dev_cnt + dev * k + id % k; + if (id < len) arr[to_id] = from[id]; +} + +template +constexpr ncclDataType_t get_nccl_datatype() +{ + if (std::is_same_v) { + static_assert(sizeof(float) == 4, "float size is not 32 bits"); + return ncclFloat32; + } + if (std::is_same_v) return ncclUint64; + if (std::is_same_v) return ncclInt8; + if (std::is_same_v) return ncclUint8; + if (std::is_same_v) return ncclFloat16; + throw std::runtime_error("no supported nccl datatype"); +} + +class DeviceRestorer { + public: + DeviceRestorer() { ANN_CUDA_CHECK(cudaGetDevice(&cur_dev)); } + ~DeviceRestorer() { ANN_CUDA_CHECK(cudaSetDevice(cur_dev)); } + + private: + int cur_dev; +}; + +} // namespace + +namespace cuann { + +template +class MultiGpuANN : public ANN { + public: + using typename ANN::AnnSearchParam; + + MultiGpuANN(Metric metric, + int dim, + const typename Algo::BuildParam& param, + const std::vector& dev_list); + + ~MultiGpuANN(); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + + void 
search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& file) const override; + void load(const std::string& file) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + if (dev_ann_property_.dataset_memory_type == MemoryType::Host) { + property.dataset_memory_type = MemoryType::Host; + } else if (dev_ann_property_.dataset_memory_type == MemoryType::Device) { + property.dataset_memory_type = MemoryType::HostMmap; + } else { + throw std::runtime_error("multigpu: invalid device algo dataset memory type"); + } + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = dev_ann_property_.need_dataset_when_search; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + void distribute_dataset_(const T* dataset, size_t nrow); + void add_index_offset_(size_t* arr, size_t len, size_t offset, cudaStream_t stream) const; + void set_wait_for_all_streams_(cudaStream_t stream) const; + template + void reset_search_data_placement_( + U* arr, U* from, int k, int batch_size, size_t all_result_size, cudaStream_t stream) const; + + const static int block_size_ = 256; + using ANN::dim_; + std::vector event_; + std::vector> dev_ann_interface_; + AlgoProperty dev_ann_property_; + std::vector dev_id_; + std::vector d_data_; + std::vector dev_stream_; + std::vector mempool_; + std::vector comms_; + std::vector dev_data_offset_; + int dev_cnt_; + size_t nrow_; +}; + +template +MultiGpuANN::MultiGpuANN(Metric metric, + int dim, + const typename Algo::BuildParam& param, + const std::vector& dev_list) + : ANN(metric, dim), + dev_cnt_(dev_list.size()), + dev_ann_interface_(dev_list.size()), + dev_id_(dev_list), + d_data_(dev_list.size()), + dev_stream_(dev_list.size()), + event_(dev_list.size()), + mempool_(dev_list.size()), + comms_(dev_list.size()), + 
dev_data_offset_(dev_list.size()) +{ + DeviceRestorer restore_dev; + uint64_t threshold = UINT64_MAX; + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + ANN_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool_[i], dev_id_[i])); + ANN_CUDA_CHECK( + cudaMemPoolSetAttribute(mempool_[i], cudaMemPoolAttrReleaseThreshold, &threshold)); + std::vector desc; + for (int j = 0; j < dev_cnt_; j++) { + if (i == j) continue; + cudaMemAccessDesc tmp_desc; + tmp_desc.location.type = cudaMemLocationTypeDevice; + tmp_desc.location.id = dev_id_[j]; + tmp_desc.flags = cudaMemAccessFlagsProtReadWrite; + desc.push_back(tmp_desc); + } + ANN_CUDA_CHECK(cudaMemPoolSetAccess(mempool_[i], desc.data(), desc.size())); + ANN_CUDA_CHECK(cudaStreamCreate(&dev_stream_[i])); + ANN_CUDA_CHECK(cudaEventCreate(&event_[i], cudaEventDisableTiming)); + dev_ann_interface_[i] = std::make_unique(metric, dim, param); + } + NCCLCHECK(ncclCommInitAll(comms_.data(), dev_cnt_, dev_id_.data())); + + dev_ann_property_ = dev_ann_interface_[0]->get_property(); + if (dev_ann_property_.query_memory_type != MemoryType::Device) { + throw std::runtime_error("multigpu: query_memory_type of dev_algo must be DEVICE!"); + } +} + +template +MultiGpuANN::~MultiGpuANN() +{ + DeviceRestorer restore_dev; + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + if (d_data_[i] && dev_ann_property_.dataset_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaFree(d_data_[i])); + } + ANN_CUDA_CHECK(cudaStreamDestroy(dev_stream_[i])); + ANN_CUDA_CHECK(cudaEventDestroy(event_[i])); + NCCLCHECK(ncclCommDestroy(comms_[i])); + } +} + +template +void MultiGpuANN::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + DeviceRestorer restore_dev; + distribute_dataset_(dataset, nrow); + nrow_ = nrow; + + std::vector threads; + + size_t basic_size = nrow / dev_cnt_; + size_t offset = 0; + int mod = nrow % dev_cnt_; + for (int i = 0; i < dev_cnt_; i++) { + size_t data_size 
= basic_size + (mod > i ? 1 : 0); + threads.emplace_back([&, i, data_size]() { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + dev_ann_interface_[i]->build(d_data_[i], data_size, dev_stream_[i]); + }); + dev_data_offset_[i] = offset; + offset += data_size; + } + for (auto& it : threads) + it.join(); + + set_wait_for_all_streams_(stream); +} + +template +void MultiGpuANN::set_search_param(const AnnSearchParam& param) +{ + DeviceRestorer restore_dev; + auto search_param = dynamic_cast(param); + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + dev_ann_interface_[i]->set_search_param(search_param); + } +} + +template +void MultiGpuANN::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + DeviceRestorer restore_dev; + + std::vector d_queries(dev_cnt_); + std::vector d_neighbors(dev_cnt_); + std::vector d_distances(dev_cnt_); + + float* candidate_distances; + float* result_distances; + size_t* candidate_neighbors; + size_t* result_neighbors; + + int cur_dev; + ANN_CUDA_CHECK(cudaGetDevice(&cur_dev)); + + auto cur_dev_it = std::find(dev_id_.begin(), dev_id_.end(), cur_dev); + if (cur_dev_it == dev_id_.end()) { + throw std::runtime_error("current device is not in dev_list!"); + } + int cur_dev_id = cur_dev_it - dev_id_.begin(); + + size_t single_dev_result_size = static_cast(k) * batch_size; + size_t all_result_size = single_dev_result_size * dev_cnt_; + + ANN_CUDA_CHECK(cudaMallocAsync( + &candidate_distances, all_result_size * sizeof(float), dev_stream_[cur_dev_id])); + ANN_CUDA_CHECK(cudaMallocAsync( + &candidate_neighbors, all_result_size * sizeof(size_t), dev_stream_[cur_dev_id])); + ANN_CUDA_CHECK( + cudaMallocAsync(&result_distances, all_result_size * sizeof(float), dev_stream_[cur_dev_id])); + ANN_CUDA_CHECK( + cudaMallocAsync(&result_neighbors, all_result_size * sizeof(size_t), dev_stream_[cur_dev_id])); + + for (int i = 0; i < dev_cnt_; i++) { + 
ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + ANN_CUDA_CHECK(cudaMallocAsync(&d_queries[i], batch_size * dim_ * sizeof(T), dev_stream_[i])); + ANN_CUDA_CHECK( + cudaMallocAsync(&d_neighbors[i], single_dev_result_size * sizeof(size_t), dev_stream_[i])); + ANN_CUDA_CHECK( + cudaMallocAsync(&d_distances[i], single_dev_result_size * sizeof(float), dev_stream_[i])); + } + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < dev_cnt_; i++) { + NCCLCHECK(ncclBroadcast(queries, + d_queries[i], + batch_size * dim_, + get_nccl_datatype(), + cur_dev_id, + comms_[i], + dev_stream_[i])); + } + NCCLCHECK(ncclGroupEnd()); + + std::vector threads; + + for (int i = 0; i < dev_cnt_; i++) { + threads.emplace_back([&, i]() { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + dev_ann_interface_[i]->search( + d_queries[i], batch_size, k, d_neighbors[i], d_distances[i], dev_stream_[i]); + add_index_offset_( + d_neighbors[i], single_dev_result_size, dev_data_offset_[i], dev_stream_[i]); + }); + } + + for (auto& it : threads) + it.join(); + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < dev_cnt_; i++) { + NCCLCHECK(ncclRecv(result_distances + i * single_dev_result_size, + single_dev_result_size, + get_nccl_datatype(), + i, + comms_[cur_dev_id], + dev_stream_[cur_dev_id])); + NCCLCHECK(ncclRecv(result_neighbors + i * single_dev_result_size, + single_dev_result_size, + get_nccl_datatype(), + i, + comms_[cur_dev_id], + dev_stream_[cur_dev_id])); + } + for (int i = 0; i < dev_cnt_; i++) { + NCCLCHECK(ncclSend(d_distances[i], + single_dev_result_size, + get_nccl_datatype(), + cur_dev_id, + comms_[i], + dev_stream_[i])); + NCCLCHECK(ncclSend(d_neighbors[i], + single_dev_result_size, + get_nccl_datatype(), + cur_dev_id, + comms_[i], + dev_stream_[i])); + } + NCCLCHECK(ncclGroupEnd()); + + set_wait_for_all_streams_(stream); + + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[cur_dev_id])); + + reset_search_data_placement_( + candidate_distances, result_distances, k, batch_size, all_result_size, stream); + 
reset_search_data_placement_( + candidate_neighbors, result_neighbors, k, batch_size, all_result_size, stream); + + void* warp_sort_topk_buf = nullptr; + size_t buf_size = 0; + + nv::warp_sort_topk(nullptr, + buf_size, + candidate_distances, + candidate_neighbors, + batch_size, + k * dev_cnt_, + k, + distances, + neighbors, + false, + stream); + ANN_CUDA_CHECK(cudaMallocAsync(&warp_sort_topk_buf, buf_size, stream)); + nv::warp_sort_topk(warp_sort_topk_buf, + buf_size, + candidate_distances, + candidate_neighbors, + batch_size, + k * dev_cnt_, + k, + distances, + neighbors, + false, + stream); + + ANN_CUDA_CHECK(cudaFreeAsync(warp_sort_topk_buf, stream)); + ANN_CUDA_CHECK(cudaFreeAsync(candidate_neighbors, stream)); + ANN_CUDA_CHECK(cudaFreeAsync(candidate_distances, stream)); + ANN_CUDA_CHECK(cudaFreeAsync(result_neighbors, stream)); + ANN_CUDA_CHECK(cudaFreeAsync(result_distances, stream)); + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + ANN_CUDA_CHECK(cudaFreeAsync(d_queries[i], stream)); + ANN_CUDA_CHECK(cudaFreeAsync(d_neighbors[i], stream)); + ANN_CUDA_CHECK(cudaFreeAsync(d_distances[i], stream)); + } + ANN_CUDA_CHECK_LAST_ERROR() +} + +template +void MultiGpuANN::save(const std::string& file) const +{ + DeviceRestorer restore_dev; + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + dev_ann_interface_[i]->save(file + "_" + std::to_string(i)); + } + std::ofstream ofs(file); + if (!ofs) { throw std::runtime_error("can't open index file: " + file); } + ofs << nrow_ << '\n'; + for (auto it : dev_data_offset_) + ofs << it << '\n'; + ofs.close(); + if (!ofs) { throw std::runtime_error("can't write to index file: " + file); } +} + +template +void MultiGpuANN::load(const std::string& file) +{ + DeviceRestorer restore_dev; + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + dev_ann_interface_[i]->load(file + "_" + std::to_string(i)); + } + std::ifstream ifs(file); 
+ if (!ifs) { throw std::runtime_error("can't open index file: " + file); } + ifs >> nrow_; + for (auto& it : dev_data_offset_) + ifs >> it; + ifs.close(); + if (!ifs) { throw std::runtime_error("can't read from index file: " + file); } +} + +template +void MultiGpuANN::set_search_dataset(const T* dataset, size_t nrow) +{ + DeviceRestorer restore_dev; + distribute_dataset_(dataset, nrow); + size_t basic_size = nrow / dev_cnt_; + size_t offset = 0; + int mod = nrow % dev_cnt_; + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + size_t data_size = basic_size + (mod > i ? 1 : 0); + dev_ann_interface_[i]->set_search_dataset(d_data_[i], data_size); + offset += data_size; + } +} + +template +void MultiGpuANN::distribute_dataset_(const T* dataset, size_t nrow) +{ + size_t basic_size = nrow / dev_cnt_; + size_t offset = 0; + int mod = nrow % dev_cnt_; + for (int i = 0; i < dev_cnt_; i++) { + size_t data_size = (basic_size + (mod > i ? 1 : 0)) * dim_; + if (dev_ann_property_.dataset_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + ANN_CUDA_CHECK(cudaMalloc(&d_data_[i], data_size * sizeof(T))); + ANN_CUDA_CHECK(cudaMemcpyAsync(d_data_[i], + dataset + offset, + data_size * sizeof(T), + cudaMemcpyHostToDevice, + dev_stream_[i])); + } else { + d_data_[i] = const_cast(dataset) + offset; + } + offset += data_size; + } +} + +template +void MultiGpuANN::add_index_offset_(size_t* arr, + size_t len, + size_t offset, + cudaStream_t stream) const +{ + add_index_offset_kernel<<<(len + block_size_ - 1) / block_size_, block_size_, 0, stream>>>( + arr, len, offset); +} + +template +void MultiGpuANN::set_wait_for_all_streams_(cudaStream_t stream) const +{ + for (int i = 0; i < dev_cnt_; i++) { + ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); + ANN_CUDA_CHECK(cudaEventRecord(event_[i], dev_stream_[i])); + ANN_CUDA_CHECK(cudaStreamWaitEvent(stream, event_[i], 0)); + } +} + +template +template +void 
MultiGpuANN::reset_search_data_placement_( + U* arr, U* from, int k, int batch_size, size_t all_result_size, cudaStream_t stream) const +{ + reset_search_data_placement_kernel<<<(all_result_size + block_size_ - 1) / block_size_, + block_size_, + 0, + stream>>>( + arr, from, all_result_size, k, batch_size, dev_cnt_); +} + +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/raft_cuann_utils.h b/cpp/cuann_bench/src/raft_cuann_utils.h new file mode 100644 index 0000000000..0e3e78cad3 --- /dev/null +++ b/cpp/cuann_bench/src/raft_cuann_utils.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RAFT_CUANN_UTILS_H_ +#define RAFT_CUANN_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuann { + +inline raft::distance::DistanceType parse_metric_type(cuann::Metric metric) +{ + if (metric == cuann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == cuann::Metric::kEuclidean) { + // Even when the configured metric is expanded L2, RAFT IVF-Flat uses the unexpanded formula + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} +} // namespace cuann + +#endif \ No newline at end of file diff --git a/cpp/cuann_bench/src/raft_ivf_flat.cu b/cpp/cuann_bench/src/raft_ivf_flat.cu new file mode 100644 index 0000000000..80b4f279cf --- /dev/null +++ b/cpp/cuann_bench/src/raft_ivf_flat.cu @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "raft_ivf_flat_wrapper.h" + +namespace cuann { +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +} // namespace cuann \ No newline at end of file diff --git a/cpp/cuann_bench/src/raft_ivf_flat_wrapper.h b/cpp/cuann_bench/src/raft_ivf_flat_wrapper.h new file mode 100644 index 0000000000..e1f57d3c22 --- /dev/null +++ b/cpp/cuann_bench/src/raft_ivf_flat_wrapper.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RAFT_IVF_FLAT_WRAPPER_H_ +#define RAFT_IVF_FLAT_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#include "cudart_util.h" +#include "raft_cuann_utils.h" + +namespace cuann { + +template +class RaftIvfFlatGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_flat::search_params ivf_flat_params; + }; + + using BuildParam = raft::neighbors::ivf_flat::index_params; + + RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_flat::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + const int serialization_version = 1; + rmm::mr::pool_memory_resource mr_; +}; + +template +RaftIvfFlatGpu::RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + 
index_params_.metric = parse_metric_type(metric); + ANN_CUDA_CHECK(cudaGetDevice(&device_)); +} + +template +void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t) +{ + index_.emplace( + raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_)); + return; +} + +template +void RaftIvfFlatGpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfFlatGpu::save(const std::string& file) const +{ + raft::spatial::knn::ivf_flat::detail::serialize(handle_, file, *index_); + return; +} + +template +void RaftIvfFlatGpu::load(const std::string& file) +{ + index_ = raft::spatial::knn::ivf_flat::detail::deserialize(handle_, file); + return; +} + +template +void RaftIvfFlatGpu::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; + static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); + raft::neighbors::ivf_flat::search( + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr); + handle_.sync_stream(); + return; +} +} // namespace cuann +#endif diff --git a/cpp/cuann_bench/src/raft_ivf_pq.cu b/cpp/cuann_bench/src/raft_ivf_pq.cu new file mode 100644 index 0000000000..0369edfc56 --- /dev/null +++ b/cpp/cuann_bench/src/raft_ivf_pq.cu @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_ivf_pq_wrapper.h" + +namespace cuann { +template class RaftIvfPQ; +template class RaftIvfPQ; +} // namespace cuann diff --git a/cpp/cuann_bench/src/raft_ivf_pq_wrapper.h b/cpp/cuann_bench/src/raft_ivf_pq_wrapper.h new file mode 100644 index 0000000000..0611b291d5 --- /dev/null +++ b/cpp/cuann_bench/src/raft_ivf_pq_wrapper.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RAFT_IVF_PQ_WRAPPER_H_ +#define RAFT_IVF_PQ_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#include "cudart_util.h" +#include "raft_cuann_utils.h" + +namespace cuann { + +template +class RaftIvfPQ : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_pq::search_params pq_param; + }; + + using BuildParam = raft::neighbors::ivf_pq::index_params; + + RaftIvfPQ(Metric metric, int dim, const BuildParam& param, float refine_ratio); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + void set_search_dataset(const T* dataset, IdxT nrow) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; // actually it is only used during refinement + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_pq::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + float refine_ratio_ = 1.0; + const int serialization_version = 1; + rmm::mr::pool_memory_resource mr_; + raft::device_matrix_view dataset_; +}; +template +RaftIvfPQ::RaftIvfPQ(Metric metric, int dim, const BuildParam& param, 
float refine_ratio) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + refine_ratio_(refine_ratio), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + index_params_.metric = parse_metric_type(metric); + ANN_CUDA_CHECK(cudaGetDevice(&device_)); +} + +template +void RaftIvfPQ::save(const std::string& file) const +{ + raft::runtime::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void RaftIvfPQ::load(const std::string& file) +{ + auto index_tmp = raft::neighbors::ivf_pq::index(handle_, index_params_, dimension_); + raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, &index_tmp); + index_.emplace(std::move(index_tmp)); + return; +} + +template +void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) +{ + index_.emplace(raft::runtime::neighbors::ivf_pq::build( + handle_, index_params_, dataset, IdxT(nrow), dimension_)); + return; +} + +template +void RaftIvfPQ::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.pq_param; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfPQ::set_search_dataset(const T* dataset, IdxT nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void RaftIvfPQ::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + // raft::logger::get(raft::RAFT_NAME).set_level(RAFT_LEVEL_INFO); + + rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; + if (refine_ratio_ > 1.0f) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); + auto candidates = raft::make_device_matrix(handle_, batch_size, k0); + + raft::runtime::neighbors::ivf_pq::search(handle_, + search_params_, + *index_, + queries, + batch_size, + k0, + candidates.data_handle(), + distances_tmp.data_handle(), + 
mr_ptr); + + if (get_property().dataset_memory_type == MemoryType::Device) { + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view(neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::refine(handle_, + dataset_, + queries_v, + candidates.view(), + neighbors_v, + distances_v, + index_->metric()); + } else { + auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream()); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + handle_.get_stream()); + + auto dataset_v = raft::make_host_matrix_view( + dataset_.data_handle(), batch_size, index_->dim()); + + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + index_->metric()); + + raft::copy( + neighbors, neighbors_host.data_handle(), neighbors_host.size(), handle_.get_stream()); + raft::copy( + distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); + } + } else { + raft::runtime::neighbors::ivf_pq::search(handle_, + search_params_, + *index_, + queries, + batch_size, + k, + (IdxT*)neighbors, + distances, + mr_ptr); + } + handle_.sync_stream(); + return; +} +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/raft_wrapper.h b/cpp/cuann_bench/src/raft_wrapper.h new file mode 100644 index 0000000000..01f6d4e4fe --- /dev/null +++ b/cpp/cuann_bench/src/raft_wrapper.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RAFT_WRAPPER_H_ +#define RAFT_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" + +namespace raft_temp { + +inline raft::distance::DistanceType parse_metric_type(cuann::Metric metric) +{ + if (metric == cuann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == cuann::Metric::kEuclidean) { + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +} // namespace raft_temp + +namespace cuann { + +// brute force fused L2 KNN - RAFT +template +class RaftGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + RaftGpu(Metric metric, int dim); + + void build(const T*, size_t, cudaStream_t) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search 
= true; + return property; + } + void set_search_dataset(const T* dataset, size_t nrow) override; + void save(const std::string& file) const override; + void load(const std::string&) override { return; }; + + protected: + raft::distance::DistanceType metric_type_; + int device_; + const T* dataset_; + size_t nrow_; +}; + +template +RaftGpu::RaftGpu(Metric metric, int dim) + : ANN(metric, dim), metric_type_(raft_temp::parse_metric_type(metric)) +{ + static_assert(std::is_same_v, "raft support only float type"); + assert(metric_type_ == raft::distance::DistanceType::L2Expanded); + ANN_CUDA_CHECK(cudaGetDevice(&device_)); +} + +template +void RaftGpu::build(const T*, size_t, cudaStream_t) +{ + // brute-force algorithm, so there is no index to build + return; +} + +template +void RaftGpu::set_search_param(const AnnSearchParam&) +{ + // nothing to set: this is the brute-force implementation +} + +template +void RaftGpu::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = dataset; + nrow_ = nrow; +} + +template +void RaftGpu::save(const std::string& file) const +{ + // create an empty index file, since there is no index to store. + std::fstream fp; + fp.open(file.c_str(), std::ios::out); + if (!fp) { + printf("Error in creating file!!!\n"); + ; + return; + } + fp.close(); +} + +template +void RaftGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + raft::spatial::knn::detail::fusedL2Knn(this->dim_, + reinterpret_cast(neighbors), + distances, + dataset_, + queries, + nrow_, + static_cast(batch_size), + k, + true, + true, + stream, + metric_type_); +} + +} // namespace cuann + +#endif diff --git a/cpp/cuann_bench/src/util.cpp b/cpp/cuann_bench/src/util.cpp new file mode 100644 index 0000000000..3225e16e78 --- /dev/null +++ b/cpp/cuann_bench/src/util.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "util.h" + +#include +#include + +#include +#include + +namespace benchmark { + +std::vector split(const std::string& s, char delimiter) +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +bool file_exists(const std::string& filename) +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +bool dir_exists(const std::string& dir) +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +bool create_dir(const std::string& dir) +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +} // namespace benchmark diff --git a/cpp/cuann_bench/src/util.h b/cpp/cuann_bench/src/util.h new file mode 100644 index 0000000000..e317cee4ac --- /dev/null +++ b/cpp/cuann_bench/src/util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef UTIL_H_ +#define UTIL_H_ + +#include +#include +#include +#include +#include +#include + +namespace benchmark { + +class Timer { + public: + Timer() { reset(); } + void reset() { start_time_ = std::chrono::steady_clock::now(); } + float elapsed_ms() + { + auto end_time = std::chrono::steady_clock::now(); + auto dur = + std::chrono::duration_cast>(end_time - start_time_); + return dur.count(); + } + + private: + std::chrono::steady_clock::time_point start_time_; +}; + +std::vector split(const std::string& s, char delimiter); + +bool file_exists(const std::string& filename); +bool dir_exists(const std::string& dir); +bool create_dir(const std::string& dir); + +template +void log_(const char* level, Ts... vs) +{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s [%s] ", buf, level); + printf(vs...); + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts... vs) +{ + log_("info", vs...); +} + +template +void log_warn(Ts... vs) +{ + log_("warn", vs...); +} + +template +void log_error(Ts... 
vs) +{ + log_("error", vs...); +} + +} // namespace benchmark + +#endif diff --git a/cpp/cuann_bench/third_party/patches/ggnn.patch b/cpp/cuann_bench/third_party/patches/ggnn.patch new file mode 100644 index 0000000000..95e1aaff4b --- /dev/null +++ b/cpp/cuann_bench/third_party/patches/ggnn.patch @@ -0,0 +1,206 @@ +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. + #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ 
struct GGNNGPUInstance { + } + + GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. +@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, 
shard_id); + } + } ++ ++ void set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using 
vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. 
+ #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" diff --git a/cpp/cuann_bench/third_party/patches/json.patch b/cpp/cuann_bench/third_party/patches/json.patch new file mode 100644 index 0000000000..83dd56bc16 --- /dev/null +++ b/cpp/cuann_bench/third_party/patches/json.patch @@ -0,0 +1,38 @@ +--- nlohmann/json.hpp 2021-05-06 11:40:39.770669693 +0800 ++++ nlohmann/json_patched.hpp 2021-06-02 18:46:43.849334466 +0800 +@@ -16607,6 +16607,21 @@ + } + } + ++ ++ template ::value, int> = 0> ++ bool is_negative_number(NumberType x) ++ { ++ return x < 0; ++ } ++ ++ template < typename NumberType, ++ enable_if_t < std::is_unsigned::value, int > = 0 > ++ bool is_negative_number(NumberType /*unused*/) ++ { ++ return false; ++ } ++ + /*! + @brief dump an integer + +@@ -16649,12 +16664,11 @@ + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg) + +- const bool is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars{}; + +- if (is_negative) ++ if (is_negative_number(x)) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); diff --git a/dependencies.yaml b/dependencies.yaml index 571c9a095a..c9d4e9c81b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -22,6 +22,11 @@ files: - cudatoolkit - py_version - test_python + cuann_bench: + output: none + includes: + - cudatoolkit + - nn_bench checks: output: none includes: @@ -79,6 +84,14 @@ dependencies: - output_types: [conda] packages: - clang-tools=11.1.0 + nn_bench: + common: + - output_types: [conda] + packages: + - hnswlib + - nlohmann_json + - glog + cudatoolkit: specific: - output_types: conda From b0a918c34dcb6580bffe51cb51f67562d2ae3bba Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Feb 2023 23:22:02 -0500 Subject: [PATCH 02/39] Getting cuann benchmarks to build --- build.sh | 2 +- cpp/cmake/thirdparty/get_hnswlib.cmake | 22 +- cpp/cuann_bench/CMakeLists.txt | 182 +++--- cpp/cuann_bench/conf/sift-128-euclidean.json | 86 ++- cpp/cuann_bench/src/benchmark.cpp | 555 +++++++++++++++++++ cpp/cuann_bench/src/factory.cuh | 36 +- cpp/cuann_bench/src/factory.h | 134 +++++ 7 files changed, 911 insertions(+), 106 deletions(-) create mode 100644 cpp/cuann_bench/src/benchmark.cpp create mode 100644 cpp/cuann_bench/src/factory.h diff --git a/build.sh b/build.sh index 8731ec1020..ae74dc33cd 100755 --- a/build.sh +++ b/build.sh @@ -340,7 +340,7 @@ fi if hasArg cuann_bench || (( ${NUMARGS} == 0 )); then BUILD_CUANN_BENCH=ON - CMAKE_TARGET="${CMAKE_TARGET};CUANN_BENCH" + CMAKE_TARGET="${CMAKE_TARGET};CUANN_BENCH_HNSWLIB" ENABLE_NN_DEPENDENCIES=ON COMPILE_NN_LIBRARY=ON COMPILE_DIST_LIBRARY=ON diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index d4ebaf0729..69ea99a006 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -21,10 +21,26 @@ function(find_and_configure_hnswlib) set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) - IF ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) + if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib ) + + execute_process ( + COMMAND mkdir hnswlib + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + + execute_process ( + COMMAND wget https://github.com/nmslib/hnswlib/archive/refs/tags/v0.6.2.zip -O hnswlib-0.6.2.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) + + execute_process ( + COMMAND unzip hnswlib-0.6.2.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) execute_process ( - COMMAND git clone "https://github.com/${PKG_FORK}/hnswlib" --branch ${PKG_PINNED_TAG} hnswlib-src - WORKING_DIRECTORY 
${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + COMMAND mv -f hnswlib-0.6.2/hnswlib/ . + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) + execute_process ( + COMMAND rm -r hnswlib-0.6.2 hnswlib-0.6.2.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) + endif () endfunction() diff --git a/cpp/cuann_bench/CMakeLists.txt b/cpp/cuann_bench/CMakeLists.txt index a51417b44b..1f5e79038b 100644 --- a/cpp/cuann_bench/CMakeLists.txt +++ b/cpp/cuann_bench/CMakeLists.txt @@ -15,14 +15,14 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- -option(RAFT_CUANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" OFF) +option(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) +option(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) +option(RAFT_CUANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" OFF) +option(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" OFF) +option(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_CUANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_GGNN "Include ggnn 
algorithm in benchmark" ON) +option(RAFT_CUANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) set(RAFT_CUANN_BENCH_USE_FAISS OFF) if(RAFT_CUANN_BENCH_USE_FAISS_BFKNN @@ -44,6 +44,12 @@ if(RAFT_CUANN_BENCH_USE_RAFT_BFKNN set(RAFT_CUANN_BENCH_USE_RAFT ON) endif() +if(NOT RAFT_CUANN_BENCH_USE_RAFT) + set(RAFT_COMPILE_DISTANCE_LIBRARY OFF) + set(RAFT_COMPILE_NN_LIBRARY OFF) + set(RAFT_ENABLE_NN_DEPENDENCIES OFF) +endif() + option(RAFT_CUANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) include(cmake/thirdparty/get_nlohmann_json.cmake) @@ -56,66 +62,114 @@ if(RAFT_CUANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss.cmake) endif() -add_executable( - CUANN_BENCH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_flat.cu - cuann_bench/src/raft_ivf_pq.cu cuann_bench/src/util.cpp -) +function(ConfigureCuannBench) + + set(oneValueArgs NAME) + set(multiValueArgs PATH LINKS CXXFLAGS INCLUDES) + + cmake_parse_arguments( + ConfigureCuannBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ) + + set(BENCH_NAME CUANN_BENCH_${ConfigureCuannBench_NAME}) + + add_executable( + ${BENCH_NAME} ${ConfigureCuannBench_PATH} cuann_bench/src/conf.cpp cuann_bench/src/util.cpp + ) + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + nlohmann_json::nlohmann_json + $<$:NCCL::NCCL> + ${ConfigureCuannBench_LINKS} + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + set(${ConfigureCuannBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureCuannBench_CXXFLAGS}) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${ConfigureCuannBench_CXXFLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + if(RAFT_CUANN_BENCH_USE_${ConfigureCuannBench_NAME}) + target_compile_definitions( + 
${BENCH_NAME} + PUBLIC + RAFT_CUANN_BENCH_USE_${ConfigureCuannBench_NAME}=${RAFT_CUANN_BENCH_USE_}${ConfigureCuannBench_NAME} + ) + endif() + + target_include_directories( + ${BENCH_NAME} + PUBLIC "$" + PRIVATE ${ConfigureCuannBench_INCLUDES} + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/cuann_bench + EXCLUDE_FROM_ALL + ) +endfunction() -target_link_libraries( - CUANN_BENCH - PRIVATE raft::raft - nlohmann_json::nlohmann_json - raft_internal - $<$:raft::distance> - $<$:raft::nn> - $<$:faiss::faiss> - $<$:NCCL::NCCL> - # $<$:hnswlib> - Threads::Threads - $ - $ -) +if(RAFT_CUANN_BENCH_USE_HNSWLIB) + ConfigureCuannBench( + NAME HNSWLIB PATH cuann_bench/src/benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib/hnswlib CXXFLAGS -mavx + ) +endif() -set_target_properties( - CUANN_BENCH - PROPERTIES # set target compile options - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON -) +if(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ) + ConfigureCuannBench( + NAME RAFT_IVF_PQ PATH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_pq.cu LINKS + raft::distance raft::nn + ) +endif() -target_compile_options( - CUANN_BENCH PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" -) +if(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT) + ConfigureCuannBench( + NAME RAFT_IVF_FLAT PATH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_flat.cu LINKS + raft::distance raft::nn + ) +endif() -target_compile_definitions( - CUANN_BENCH - PUBLIC RAFT_CUANN_BENCH_USE_FAISS_BFKNN=${RAFT_CUANN_BENCH_USE_FAISS_BFKNN} - RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT=${RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT} - RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ=${RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ} - RAFT_CUANN_BENCH_USE_RAFT_BFKNN=${RAFT_CUANN_BENCH_USE_RAFT_BFKNN} - RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT=${RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT} - 
RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ=${RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ} - RAFT_CUANN_BENCH_USE_HNSWLIB=${RAFT_CUANN_BENCH_USE_HNSWLIB} - RAFT_CUANN_BENCH_USE_GGNN=${RAFT_CUANN_BENCH_USE_GGNN} -) +if(RAFT_CUANN_BENCH_USE_RAFT_BFKNN) + ConfigureCuannBench( + NAME RAFT_IVF_FLAT PATH cuann_bench/src/benchmark.cu LINKS raft::distance raft::nn + ) +endif() -target_include_directories( - CUANN_BENCH - PUBLIC "$" - PRIVATE - "$:${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include>>" - "$:${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib>>" -) +if(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT) + ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +endif() -install( - TARGETS CUANN_BENCH - COMPONENT testing - DESTINATION bin/CUANN_BENCH - EXCLUDE_FROM_ALL -) +if(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ) + ConfigureCuannBench(NAME FAISS_IVF_PQ PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +endif() + +if(RAFT_CUANN_BENCH_USE_FAISS_BFKNN) + ConfigureCuannBench(NAME FAISS_BFKNN PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +endif() + +if(RAFT_CUANN_BENCH_USE_GGNN) + ConfigureCuannBench( + NAME GGNN PATH cuann_bench/src/benchmark.cu INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include + ) +endif() diff --git a/cpp/cuann_bench/conf/sift-128-euclidean.json b/cpp/cuann_bench/conf/sift-128-euclidean.json index 081d6cba2c..e759d3133e 100644 --- a/cpp/cuann_bench/conf/sift-128-euclidean.json +++ b/cpp/cuann_bench/conf/sift-128-euclidean.json @@ -1,8 +1,8 @@ { "dataset": { "name": "sift-128-euclidean", - "base_file": "/workspace/rapids/knn/cuann/benchmark/sift-128-euclidean/base.fbin", - "query_file": "/workspace/rapids/knn/cuann/benchmark/sift-128-euclidean/query.fbin", + "base_file": "/home/cjnolet/workspace/ann_data/sift-128-euclidean/base.fbin", + "query_file": "/home/cjnolet/workspace/ann_data/sift-128-euclidean/query.fbin", "distance": "euclidean" }, "search_basic_param": { @@ -12,6 +12,86 @@ }, "index": [ { + "name" : 
"hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + 
{"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M36" + }, + + + + + { "name": "raft_bfknn", "algo": "raft_bfknn", "build_param": {}, @@ -2020,4 +2100,4 @@ "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist16384" } ] -} \ No newline at end of file +} diff --git a/cpp/cuann_bench/src/benchmark.cpp b/cpp/cuann_bench/src/benchmark.cpp new file mode 100644 index 0000000000..674d107efd --- /dev/null +++ b/cpp/cuann_bench/src/benchmark.cpp @@ -0,0 +1,555 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifdef NVTX +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dataset.h" +#include "factory.h" +#include "util.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::string; +using std::to_string; +using std::unordered_set; +using std::vector; +using namespace benchmark; +using cuann::MemoryType; + +// supported types: float, half (very few implementations support it), uint8_t, int8_t +using data_t = float; + +bool check_file_exist(const vector& files) +{ + bool ret = true; + unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) == processed.end() && !file_exists(file)) { + log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +bool check_file_not_exist(const vector& files, bool force_overwrite) +{ + bool ret = true; + for (const auto& file : files) { + if (file_exists(file)) { + if (force_overwrite) { + log_warn("'%s' already exists, will overwrite it", file.c_str()); + } else { + log_error("'%s' already exists, use '-f' to force overwriting", file.c_str()); + ret = false; + } + } + } + return ret; +} + +bool check_no_duplicate_file(const vector& files) +{ + bool ret = true; + unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) != processed.end()) { + log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +bool mkdir(const vector& dirs) +{ + unordered_set processed; + for (const auto& dir : dirs) { + if (processed.find(dir) == processed.end() && !dir_exists(dir)) { + if (create_dir(dir)) { + log_info("mkdir '%s'", dir.c_str()); + } else { + log_error("fail to create output directory '%s'", dir.c_str()); + // won't create any other dir when problem occurs + return 
false; + } + } + processed.insert(dir); + } + return true; +} + +bool check(const vector& indices, bool build_mode, bool force_overwrite) +{ + vector files_should_exist; + vector dirs_should_exist; + vector output_files; + for (const auto& index : indices) { + if (build_mode) { + output_files.push_back(index.file); + output_files.push_back(index.file + ".txt"); + + auto pos = index.file.rfind('/'); + if (pos != string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } + } else { + files_should_exist.push_back(index.file); + files_should_exist.push_back(index.file + ".txt"); + + output_files.push_back(index.search_result_file + ".0.ibin"); + output_files.push_back(index.search_result_file + ".0.txt"); + + auto pos = index.search_result_file.rfind('/'); + if (pos != string::npos) { + dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); + } + } + } + + bool ret = true; + if (!check_file_exist(files_should_exist)) { ret = false; } + if (!check_file_not_exist(output_files, force_overwrite)) { ret = false; } + if (!check_no_duplicate_file(output_files)) { ret = false; } + if (ret && !mkdir(dirs_should_exist)) { ret = false; } + return ret; +} + +void write_build_info(const string& file_prefix, + const string& dataset, + const string& distance, + const string& name, + const string& algo, + const string& build_param, + float build_time) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "build_time: " << build_time << endl; + ofs.close(); + if (!ofs) { throw std::runtime_error("can't write to build info file: " + file_prefix + ".txt"); } +} + +template +void build(const benchmark::Dataset* dataset, const vector& indices) +{ + cudaStream_t stream; + 
ANN_CUDA_CHECK(cudaStreamCreate(&stream)); + + log_info( + "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + + log_info("building index '%s'", index.name.c_str()); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); +#ifdef NVTX + nvtxRangePush("build"); +#endif + Timer timer; + algo->build(base_set_ptr, dataset->base_set_size(), stream); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); + ANN_CUDA_CHECK_LAST_ERROR(); + + algo->save(index.file); + write_build_info(index.file, + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + elapsed_ms / 1000.0f); + log_info("saved index to %s", index.file.c_str()); + } + + ANN_CUDA_CHECK(cudaStreamDestroy(stream)); +} + +void write_search_result(const string& file_prefix, + const string& dataset, + const string& distance, + const string& name, + const string& algo, + const string& build_param, + const string& search_param, + int batch_size, + int run_count, + int k, + 
float search_time_average, + float search_time_p99, + float search_time_p999, + const int* neighbors, + size_t query_set_size) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "search_param: " << search_param << "\n" + << "\n" + << "batch_size: " << batch_size << "\n" + << "run_count: " << run_count << "\n" + << "k: " << k << "\n" + << "average_search_time: " << search_time_average << endl; + if (search_time_p99 != std::numeric_limits::max()) { + ofs << "p99_search_time: " << search_time_p99 << endl; + } + if (search_time_p999 != std::numeric_limits::max()) { + ofs << "p999_search_time: " << search_time_p999 << endl; + } + ofs.close(); + if (!ofs) { + throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); + } + + BinFile neighbors_file(file_prefix + ".ibin", "w"); + neighbors_file.write(neighbors, query_set_size, k); +} + +template +void search(const benchmark::Dataset* dataset, const vector& indices) +{ + if (indices.empty()) { return; } + cudaStream_t stream; + ANN_CUDA_CHECK(cudaStreamCreate(&stream)); + + log_info("loading query set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->query_set_size()); + const T* query_set = dataset->query_set(); + // query set is usually much smaller than base set, so load it eagerly + const T* d_query_set = dataset->query_set_on_gpu(); + size_t query_set_size = dataset->query_set_size(); + + // currently all indices has same batch_size, k and run_count + const int batch_size = indices[0].batch_size; + const int k = indices[0].k; + const int run_count = indices[0].run_count; + log_info( + "basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); + 
if (query_set_size % batch_size != 0) { + log_warn("query set size (%zu) % batch size (%d) != 0, the size of last batch is %zu", + query_set_size, + batch_size, + query_set_size % batch_size); + } + const size_t num_batches = (query_set_size - 1) / batch_size + 1; + size_t* neighbors = new size_t[query_set_size * k]; + int* neighbors_buf = new int[query_set_size * k]; + float* distances = new float[query_set_size * k]; + vector search_times; + search_times.reserve(num_batches); + size_t* d_neighbors; + float* d_distances; + ANN_CUDA_CHECK(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); + ANN_CUDA_CHECK(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); + algo->load(index.file); + + const T* this_query_set = query_set; + size_t* this_neighbors = neighbors; + float* this_distances = distances; + if (algo_property.query_memory_type == MemoryType::Device) { + this_query_set = d_query_set; + this_neighbors = d_neighbors; + this_distances = d_distances; + } + + if (algo_property.need_dataset_when_search) { + log_info("loading base set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->base_set_size()); + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if 
(algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + algo->set_search_dataset(base_set_ptr, dataset->base_set_size()); + } + + for (int i = 0, end_i = index.search_params.size(); i != end_i; ++i) { + auto p_param = create_search_param(index.algo, index.search_params[i]); + algo->set_search_param(*p_param); + log_info("search with param: %s", index.search_params[i].dump().c_str()); + + if (algo_property.query_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaMemset(d_neighbors, 0, query_set_size * k * sizeof(*d_neighbors))); + ANN_CUDA_CHECK(cudaMemset(d_distances, 0, query_set_size * k * sizeof(*d_distances))); + } else { + memset(neighbors, 0, query_set_size * k * sizeof(*neighbors)); + memset(distances, 0, query_set_size * k * sizeof(*distances)); + } + + float best_search_time_average = std::numeric_limits::max(); + float best_search_time_p99 = std::numeric_limits::max(); + float best_search_time_p999 = std::numeric_limits::max(); + for (int run = 0; run < run_count; ++run) { + log_info("run %d / %d", run + 1, run_count); + for (size_t batch_id = 0; batch_id < num_batches; ++batch_id) { + size_t row = batch_id * batch_size; + int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); +#ifdef NVTX + string nvtx_label = "batch" + to_string(batch_id); + if (run_count != 1) { nvtx_label = "run" + to_string(run) + "-" + nvtx_label; } + if (batch_id == 10) { + run = run_count - 1; + break; + } +#endif + Timer timer; +#ifdef NVTX + nvtxRangePush(nvtx_label.c_str()); +#endif + algo->search(this_query_set + row * dataset->dim(), + actual_batch_size, + k, + this_neighbors + row * k, + this_distances + row * k, + stream); + ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + // If the size of the last batch is less than batch_size, don't count it for + // search time. But neighbors of the last batch will still be filled, so it's + // counted for recall calculation. + if (actual_batch_size == batch_size) { + search_times.push_back(elapsed_ms / 1000.0f); // in seconds + } + } + + float search_time_average = + std::accumulate(search_times.cbegin(), search_times.cend(), 0.0f) / search_times.size(); + best_search_time_average = std::min(best_search_time_average, search_time_average); + + if (search_times.size() >= 100) { + std::sort(search_times.begin(), search_times.end()); + + auto calc_percentile_pos = [](float percentile, size_t N) { + return static_cast(std::ceil(percentile / 100.0 * N)) - 1; + }; + + float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; + best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); + + if (search_times.size() >= 1000) { + float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; + best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); + } + } + search_times.clear(); + } + ANN_CUDA_CHECK_LAST_ERROR(); + + if (algo_property.query_memory_type == MemoryType::Device) { + ANN_CUDA_CHECK(cudaMemcpy(neighbors, + d_neighbors, + query_set_size * k * sizeof(*d_neighbors), + 
cudaMemcpyDeviceToHost)); + ANN_CUDA_CHECK(cudaMemcpy(distances, + d_distances, + query_set_size * k * sizeof(*d_distances), + cudaMemcpyDeviceToHost)); + } + + for (size_t j = 0; j < query_set_size * k; ++j) { + neighbors_buf[j] = neighbors[j]; + } + write_search_result(index.search_result_file + "." + to_string(i), + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + index.search_params[i].dump(), + batch_size, + index.run_count, + k, + best_search_time_average, + best_search_time_p99, + best_search_time_p999, + neighbors_buf, + query_set_size); + } + + log_info("finish searching for index '%s'", index.name.c_str()); + } + + delete[] neighbors; + delete[] neighbors_buf; + delete[] distances; + ANN_CUDA_CHECK(cudaFree(d_neighbors)); + ANN_CUDA_CHECK(cudaFree(d_distances)); + ANN_CUDA_CHECK(cudaStreamDestroy(stream)); +} + +const string usage(const string& argv0) +{ + return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + + " -b: build mode, will build index\n" + + " -s: search mode, will search using built index\n" + + " one and only one of -b and -s should be specified\n" + + " -c: just check command line options and conf.json are sensible\n" + + " won't build or search\n" + " -f: force overwriting existing output files\n" + + " -i: by default will build/search all the indices found in conf.json\n" + + " '-i' can be used to select a subset of indices\n" + + " 'index_names' is a list of comma-separated index names\n" + + " '*' is allowed as the last character of a name to select all matched indices\n" + + " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; +} + +int main(int argc, char** argv) +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + bool only_check = false; + string index_patterns("*"); + + int opt; + while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { + switch (opt) { + case 'b': build_mode = true; break; + case 's': search_mode = 
true; break; + case 'c': only_check = true; break; + case 'f': force_overwrite = true; break; + case 'i': index_patterns = optarg; break; + case 'h': cout << usage(argv[0]) << endl; return -1; + default: cerr << "\n" << usage(argv[0]) << endl; return -1; + } + } + if (build_mode == search_mode) { + cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; + return -1; + } + if (argc - optind != 1) { + cerr << usage(argv[0]) << endl; + return -1; + } + string conf_file = argv[optind]; + + std::ifstream conf_stream(conf_file.c_str()); + if (!conf_stream) { + log_error("can't open configuration file: %s", argv[optind]); + return -1; + } + + try { + Configuration conf(conf_stream); + + auto dataset_conf = conf.get_dataset_conf(); + BinDataset dataset(dataset_conf.name, + dataset_conf.base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + dataset_conf.query_file, + dataset_conf.distance); + + vector indices = conf.get_indices(index_patterns); + if (!check(indices, build_mode, force_overwrite)) { return -1; } + + string message = "will "; + message += build_mode ? 
"build:" : "search:"; + for (const auto& index : indices) { + message += "\n " + index.name; + } + log_info("%s", message.c_str()); + + if (only_check) { + log_info("%s", "all check passed, quit due to option -c"); + return 0; + } + + if (build_mode) { + build(&dataset, indices); + } else if (search_mode) { + search(&dataset, indices); + } + } catch (const std::exception& e) { + log_error("exception occurs: %s", e.what()); + return -1; + } +} diff --git a/cpp/cuann_bench/src/factory.cuh b/cpp/cuann_bench/src/factory.cuh index f708d2d4d8..03a5e176d6 100644 --- a/cpp/cuann_bench/src/factory.cuh +++ b/cpp/cuann_bench/src/factory.cuh @@ -32,9 +32,6 @@ #ifdef RAFT_CUANN_BENCH_USE_GGNN #include "ggnn_wrapper.cuh" #endif -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB -#include "hnswlib_wrapper.h" -#endif #ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN #include "raft_wrapper.h" #endif @@ -67,23 +64,6 @@ cuann::Metric parse_metric(const std::string& metric_str) } } -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB -template -void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) -{ - param.ef_construction = conf.at("efConstruction"); - param.M = conf.at("M"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } -} - -template -void parse_search_param(const nlohmann::json& conf, typename cuann::HnswLib::SearchParam& param) -{ - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } -} -#endif - #ifdef RAFT_CUANN_BENCH_USE_FAISS template void parse_build_param(const nlohmann::json& conf, @@ -275,9 +255,6 @@ std::unique_ptr> create_algo(const std::string& algo, std::unique_ptr> ann; if constexpr (std::is_same_v) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } -#endif #ifdef RAFT_CUANN_BENCH_USE_FAISS if (algo == "faiss_gpu_ivf_flat") { ann = make_algo(metric, dim, conf, dev_list); @@ -294,11 +271,7 @@ std::unique_ptr> 
create_algo(const std::string& algo, #endif } - if constexpr (std::is_same_v) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } -#endif - } + if constexpr (std::is_same_v) {} #ifdef RAFT_CUANN_BENCH_USE_GGNN if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } @@ -327,13 +300,6 @@ template std::unique_ptr::AnnSearchParam> create_search_param( const std::string& algo, const nlohmann::json& conf) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB - if (algo == "hnswlib") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } -#endif #ifdef RAFT_CUANN_BENCH_USE_FAISS if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { auto param = std::make_unique::SearchParam>(); diff --git a/cpp/cuann_bench/src/factory.h b/cpp/cuann_bench/src/factory.h new file mode 100644 index 0000000000..30d146d1b1 --- /dev/null +++ b/cpp/cuann_bench/src/factory.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FACTORY_H_ +#define FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#undef WARP_SIZE +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#include "hnswlib_wrapper.h" +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace benchmark { + +cuann::Metric parse_metric(const std::string& metric_str) +{ + if (metric_str == "inner_product") { + return cuann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return cuann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} + +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +template +void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, typename cuann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} +#endif + +template class Algo> +std::unique_ptr> make_algo(cuann::Metric metric, int dim, const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(cuann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + cuann::Metric metric = 
parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } +#endif + } + + if constexpr (std::is_same_v) { +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } +#endif + } + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace benchmark +#endif From e7aca5932df8280fc139448e38bc76b999fba094 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Feb 2023 23:49:11 -0500 Subject: [PATCH 03/39] More cleanup --- build.sh | 29 +- ci/docs/build.sh | 2 +- cpp/cuann_bench/CMakeLists.txt | 2 +- cpp/cuann_bench/README.md | 8 +- cpp/cuann_bench/conf/bigann-100M.json | 168 ---- cpp/cuann_bench/conf/bigann-1B.json | 139 ---- cpp/cuann_bench/conf/deep-100M.fp16.json | 50 -- cpp/cuann_bench/conf/deep-100M.json | 297 ------- cpp/cuann_bench/conf/deep-1B.json | 266 ------ cpp/cuann_bench/conf/glove-100-inner.json | 709 ---------------- cpp/cuann_bench/conf/sift-128-euclidean.json | 804 +------------------ 11 files changed, 42 insertions(+), 2432 deletions(-) delete mode 100644 cpp/cuann_bench/conf/bigann-1B.json delete mode 100644 cpp/cuann_bench/conf/deep-100M.fp16.json diff --git a/build.sh b/build.sh index 39f2d6e122..107d1102a4 100755 --- a/build.sh +++ b/build.sh @@ -45,6 +45,7 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=::AnnSearchParam`. 
Take `class HnswLib` as an example, its definition is: ``` diff --git a/cpp/cuann_bench/conf/bigann-100M.json b/cpp/cuann_bench/conf/bigann-100M.json index d6c3a12f51..5f16f3378d 100644 --- a/cpp/cuann_bench/conf/bigann-100M.json +++ b/cpp/cuann_bench/conf/bigann-100M.json @@ -169,174 +169,6 @@ }, - { - "name" : "libcuann.dimpq48-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 48, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq48-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq48-cluster50K.refine2" - }, - - - { - "name" : "libcuann.dimpq64-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 64, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq64-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq64-cluster50K.refine2" - }, - - { - "name" : "libcuann.dimpq64-5bit-cluster50K.refine4", - "algo" : 
"libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 64, - "bitPq" : 5, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq64-5bit-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq64-5bit-cluster50K.refine4" - }, - - { - "name" : "libcuann.dimpq72-cluster50K", - "algo" : "libcuann", - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 72, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq72-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq72-cluster50K" - }, - - { - "name" : "libcuann.dimpq72-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 72, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq72-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, 
"numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq72-cluster50K.refine2" - }, - - { - "name" : "libcuann.dimpq96-cluster50K", - "algo" : "libcuann", - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 96, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq96-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq96-cluster50K" - }, - - { - "name" : "libcuann.dimpq96-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 96, - "randomRotation" : true - }, - "file" : "index/bigann-100M/libcuann/dimpq96-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-100M/libcuann/dimpq96-cluster50K.refine2" - } - ] } diff --git 
a/cpp/cuann_bench/conf/bigann-1B.json b/cpp/cuann_bench/conf/bigann-1B.json deleted file mode 100644 index 3f9c8e4457..0000000000 --- a/cpp/cuann_bench/conf/bigann-1B.json +++ /dev/null @@ -1,139 +0,0 @@ -{ - "dataset" : { - "name" : "bigann-1B", - "base_file" : "data/bigann-1B/base.1B.u8bin", - "query_file" : "data/bigann-1B/query.public.10K.u8bin", - "distance" : "euclidean" - }, - - "search_basic_param" : { - "batch_size" : 10000, - "k" : 10, - "run_count" : 2 - }, - - "index" : [ - { - "name" : "libcuann.dimpq32-cluster100K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 32, - "randomRotation" : true - }, - "file" : "index/bigann-1B/libcuann/dimpq32-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-1B/libcuann/dimpq32-cluster100K.refine4" - }, - - { - "name" : "libcuann.dimpq48-cluster100K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 48, - "randomRotation" : true - }, - "file" : "index/bigann-1B/libcuann/dimpq48-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, 
"k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-1B/libcuann/dimpq48-cluster100K.refine2" - }, - - { - "name" : "libcuann.dimpq64-cluster100K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 64, - "randomRotation" : true - }, - "file" : "index/bigann-1B/libcuann/dimpq64-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-1B/libcuann/dimpq64-cluster100K.refine2" - }, - - { - "name" : "libcuann.dimpq64-5bit-cluster100K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 64, - "bitPq" : 5, - "randomRotation" : true - }, - "file" : "index/bigann-1B/libcuann/dimpq64-5bit-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-1B/libcuann/dimpq64-5bit-cluster100K.refine4" - }, - - { - "name" : "libcuann.dimpq64-5bit-cluster250K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 250000, - "dimPq" : 64, - "bitPq" : 5, - 
"randomRotation" : true - }, - "file" : "index/bigann-1B/libcuann/dimpq64-5bit-cluster250K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/bigann-1B/libcuann/dimpq64-5bit-cluster250K.refine4" - } - - ] -} diff --git a/cpp/cuann_bench/conf/deep-100M.fp16.json b/cpp/cuann_bench/conf/deep-100M.fp16.json deleted file mode 100644 index 18fb75e8e9..0000000000 --- a/cpp/cuann_bench/conf/deep-100M.fp16.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "dataset" : { - "name" : "deep-100M-fp16", - "base_file" : "data/deep-1B/base.1B.f16bin", - "subset_size" : 100000000, - "query_file" : "data/deep-1B/query.public.10K.f16bin", - // although distance should be "euclidean", faiss becomes much slower for that - "distance" : "inner_product" - }, - - "search_basic_param" : { - "batch_size" : 10000, - "k" : 10, - "run_count" : 2 - }, - - "index" : [ - { - "name" : "cagra-fp16.k64", - "algo" : "cagra", - "build_param": { - }, - "file" : "index/deep-100M/cagra/k96.pruned.k64", - "search_params": [ - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 32 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 64 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 96 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 128 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 160 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 192 } - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32 }, - { "batch_size":10000, 
"k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":30 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":40 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":50 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":60 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":30 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":40 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":50 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":60 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 96 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 128 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 160 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 192 } - ], - "search_result_file" : "result/deep-100M/cagra-fp16/k64" - } - - ] -} diff --git a/cpp/cuann_bench/conf/deep-100M.json b/cpp/cuann_bench/conf/deep-100M.json index af699c1f50..36b42aba92 100644 --- a/cpp/cuann_bench/conf/deep-100M.json +++ b/cpp/cuann_bench/conf/deep-100M.json @@ -91,102 +91,6 @@ ], "search_result_file" : "result/deep-100M/hnswlib/M36" }, - - - { - "name" : "cugann.K64", - "algo" : "cugann", - "build_param": { - "K": 64, - "build_mode": "fast", - "max_edge_num": 160, - "rank_threshold": 6, - "long_edge_threshold": 1.3 - }, - "file" : "index/deep-100M/cugann/K64", - "search_params": [ - { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 96, 
"iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 144, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 160, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 176, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 192, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 50, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 64, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 96, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 128, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 144, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 160, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 176, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 192, "iteration_num": 7 } - ], - "search_result_file" : "result/deep-100M/cugann/K64" - }, - { - "name" : "cugann.K64-bulk", - "algo" : "cugann", - "build_param": { - "K": 64, - "build_mode": "fast", - "max_edge_num": 160, - "rank_threshold": 6, - "long_edge_threshold": 1.3 - }, - "file" : "index/deep-100M/cugann/K64", - "search_params": [ - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 24 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 32 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 38 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 48 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 54 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 64 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 76 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 84 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 
96, "iteration_num": 90 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 96 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 104 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 110 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 120 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 128 } - ], - "search_result_file" : "result/deep-100M/cugann/K64-bulk" - }, - - - { - "name" : "cagra.k64", - "algo" : "cagra", - "build_param": { - }, - "file" : "index/deep-100M/cagra/k96.pruned.k64", - "search_params": [ - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 32 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 64 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 96 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 128 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 160 }, - // { "batch_size":1, "k":10, "search_mode": "multi-cta", "internal_k": 192 } - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":30 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":40 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":50 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 32, "max_iterations":60 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":30 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":40 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", 
"internal_k": 64, "max_iterations":50 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 64, "max_iterations":60 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 96 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 128 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 160 }, - { "batch_size":10000, "k":10, "search_mode": "single-cta", "internal_k": 192 } - ], - "search_result_file" : "result/deep-100M/cagra/k64" - }, - - { "name" : "faiss_ivf_flat.nlist50K", "algo" : "faiss_gpu_ivf_flat", @@ -316,206 +220,5 @@ }, - { - "name" : "libcuann.dimpq48-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 48 - }, - "file" : "index/deep-100M/libcuann/dimpq48-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq48-cluster50K.refine2" - }, - { - "name" : "libcuann.dimpq48-5bit-cluster50K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 48, - "bitPq" : 5 - }, - "file" : "index/deep-100M/libcuann/dimpq48-5bit-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, 
"k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq48-5bit-cluster50K.refine4" - }, - { - "name" : "libcuann.dimpq64-cluster50K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 64 - }, - "file" : "index/deep-100M/libcuann/dimpq64-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq64-cluster50K.refine2" - }, - { - "name" : "libcuann.dimpq64-5bit-cluster50K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 64, - "bitPq" : 5 - }, - "file" : "index/deep-100M/libcuann/dimpq64-5bit-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq64-5bit-cluster50K.refine4" - }, - { - "name" : "libcuann.dimpq72-cluster50K", - "algo" : "libcuann", - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 72 - }, - "file" : 
"index/deep-100M/libcuann/dimpq72-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq72-cluster50K" - }, - { - "name" : "libcuann.dimpq96-cluster50K", - "algo" : "libcuann", - "build_param": { - "numDataset" : 100000000, - "numClusters" : 50000, - "dimPq" : 96 - }, - "file" : "index/deep-100M/libcuann/dimpq96-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/libcuann/dimpq96-cluster50K" - }, - - - // the following multigpu configurations are for validating correctness, not for measuring performance - { - "name" : "multigpu.faiss_ivf_flat.nlist10K", - "algo" : "faiss_gpu_ivf_flat", - "multigpu" : [0,1,2,3,4,5,6,7], - "build_param": {"nlist":10000}, - "file" : "index/deep-100M/multigpu/faiss_ivf_flat.nlist10K", - "search_params" : [ - {"nprobe":1}, - {"nprobe":5}, - {"nprobe":10}, - {"nprobe":50}, - {"nprobe":100}, - {"nprobe":200}, - {"nprobe":500}, - {"nprobe":1000} - ], - "search_result_file" : "result/deep-100M/multigpu/faiss_ivf_flat.nlist10K" - }, - - { - "name" : "multigpu.libcuann.dimpq72-cluster10K", - "algo" : "libcuann", - "multigpu" : [0,1,2,3,4,5,6,7], 
- "build_param": { - "numDataset" : 12500000, - "numClusters" : 10000, - "dimPq" : 72 - }, - "file" : "index/deep-100M/multigpu/libcuann.dimpq72-cluster10K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/multigpu/libcuann.dimpq72-cluster10K" - }, - - { - "name" : "multigpu.libcuann.dimpq72-cluster10K.refine2", - "algo" : "libcuann", - "multigpu" : [0,1,2,3,4,5,6,7], - "refine_ratio" : 2, - "build_param": { - "numDataset" : 12500000, - "numClusters" : 10000, - "dimPq" : 72 - }, - "file" : "index/deep-100M/multigpu/libcuann.dimpq72-cluster10K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-100M/multigpu/libcuann.dimpq72-cluster10K.refine2" - } - ] } diff --git a/cpp/cuann_bench/conf/deep-1B.json b/cpp/cuann_bench/conf/deep-1B.json index 8218ef30bd..50d1b87602 100644 --- a/cpp/cuann_bench/conf/deep-1B.json +++ b/cpp/cuann_bench/conf/deep-1B.json @@ -34,271 +34,5 @@ }, - { - "name" : "libcuann.dimpq48-cluster100K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 48 - }, - "file" : 
"index/deep-1B/libcuann/dimpq48-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq48-cluster100K.refine2" - }, - { - "name" : "libcuann.dimpq48-5bit-cluster100K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 48, - "bitPq" : 5 - }, - "file" : "index/deep-1B/libcuann/dimpq48-5bit-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq48-5bit-cluster100K.refine4" - }, - { - "name" : "libcuann.dimpq64-cluster100K.refine2", - "algo" : "libcuann", - "refine_ratio" : 2, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 64 - }, - "file" : "index/deep-1B/libcuann/dimpq64-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - 
{"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq64-cluster100K.refine2" - }, - { - "name" : "libcuann.dimpq64-5bit-cluster100K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 64, - "bitPq" : 5 - }, - "file" : "index/deep-1B/libcuann/dimpq64-5bit-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq64-5bit-cluster100K.refine4" - }, - { - "name" : "libcuann.dimpq64-5bit-cluster250K.refine4", - "algo" : "libcuann", - "refine_ratio" : 4, - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 250000, - "dimPq" : 64, - "bitPq" : 5 - }, - "file" : "index/deep-1B/libcuann/dimpq64-5bit-cluster250K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq64-5bit-cluster250K.refine4" - }, - - { - "name" : "libcuann.dimpq72-cluster100K", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1000000000, - "numClusters" : 100000, - "dimPq" : 72 - }, - "file" : 
"index/deep-1B/libcuann/dimpq72-cluster100K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/libcuann/dimpq72-cluster100K" - }, - // libcuann.dimpq96-cluster100K: index size 94GB, >80GB, so becomes slow - - - { - "name" : "multigpu.faiss_ivf_flat.nlist50K", - "algo" : "faiss_gpu_ivf_flat", - "multigpu" : [0,1,2,3,4,5,6,7], - "build_param": {"nlist":50000}, - "file" : "index/deep-1B/multigpu/faiss_ivf_flat.nlist50K", - "search_params" : [ - {"nprobe":20}, - {"nprobe":30}, - {"nprobe":40}, - {"nprobe":50}, - {"nprobe":100}, - {"nprobe":200}, - {"nprobe":500}, - {"nprobe":1000} - ], - "search_result_file" : "result/deep-1B/multigpu/faiss_ivf_flat.nlist50K" - }, - - { - "name" : "multigpu.libcuann.dimpq48-cluster50K.refine2", - "algo" : "libcuann", - "multigpu" : [0,1,2,3,4,5,6,7], - "refine_ratio" : 2, - "build_param": { - "numDataset" : 125000000, - "numClusters" : 50000, - "dimPq" : 48 - }, - "file" : "index/deep-1B/multigpu/libcuann.dimpq48-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/multigpu/libcuann.dimpq48-cluster50K.refine2" - }, - { - "name" : 
"multigpu.libcuann.dimpq96-cluster50K", - "algo" : "libcuann", - "multigpu" : [0,1,2,3,4,5,6,7], - "build_param": { - "numDataset" : 125000000, - "numClusters" : 50000, - "dimPq" : 96 - }, - "file" : "index/deep-1B/multigpu/libcuann.dimpq96-cluster50K", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":20}, - {"max_batch_size":10000, "k":10, "numProbes":30}, - {"max_batch_size":10000, "k":10, "numProbes":40}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/deep-1B/multigpu/libcuann.dimpq96-cluster50K" - }, - - { - "name" : "multigpu.cugann.K64", - "algo" : "cugann", - "multigpu" : [0,1,2,3,4,5,6,7], - "build_param": { - "K": 64, - "build_mode": "fast", - "max_edge_num": 160, - "rank_threshold": 6, - "long_edge_threshold": 1.3 - }, - "file" : "index/deep-1B/multigpu/cugann.K64", - "search_params": [ - { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 96, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 144, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 160, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 176, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 192, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 50, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 64, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 96, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 128, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 144, "iteration_num": 7 }, - { 
"search_mode": "fast", "searcher_num": 160, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 176, "iteration_num": 7 }, - { "search_mode": "fast", "searcher_num": 192, "iteration_num": 7 } - ], - "search_result_file" : "result/deep-1B/multigpu/cugann.K64" - }, - { - "name" : "multigpu.cugann.K64-bulk", - "algo" : "cugann", - "multigpu" : [0,1,2,3,4,5,6,7], - "build_param": { - "K": 64, - "build_mode": "fast", - "max_edge_num": 160, - "rank_threshold": 6, - "long_edge_threshold": 1.3 - }, - "file" : "index/deep-1B/multigpu/cugann.K64", - "search_params": [ - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 24 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 32, "iteration_num": 32 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 38 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 48 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 54 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 64, "iteration_num": 64 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 76 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 84 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 90 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 96, "iteration_num": 96 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 104 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 110 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 120 }, - { "search_mode": "bulk", "searcher_num": 1, "searcher_k": 128, "iteration_num": 128 } - ], - "search_result_file" : "result/deep-1B/multigpu/cugann.K64-bulk" - } - ] } diff --git a/cpp/cuann_bench/conf/glove-100-inner.json b/cpp/cuann_bench/conf/glove-100-inner.json index 
887190683f..d210aca654 100644 --- a/cpp/cuann_bench/conf/glove-100-inner.json +++ b/cpp/cuann_bench/conf/glove-100-inner.json @@ -184,226 +184,6 @@ "search_result_file" : "result/glove-100-inner/hnswlib/M96" }, - - { - "name" : "cuhnsw.M4", - "algo" : "cuhnsw", - "build_param": {"M":4, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M4", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M4" - }, - - { - "name" : "cuhnsw.M8", - "algo" : "cuhnsw", - "build_param": {"M":8, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M8", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M8" - }, - - { - "name" : "cuhnsw.M12", - "algo" : "cuhnsw", - "build_param": {"M":12, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M12", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M12" - }, - - { - "name" : "cuhnsw.M16", - "algo" : "cuhnsw", - "build_param": {"M":16, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M16", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M16" - }, - - { - "name" : "cuhnsw.M24", - "algo" : "cuhnsw", - "build_param": {"M":24, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M24", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - 
{"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M24" - }, - - { - "name" : "cuhnsw.M36", - "algo" : "cuhnsw", - "build_param": {"M":36, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M36", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M36" - }, - - { - "name" : "cuhnsw.M48", - "algo" : "cuhnsw", - "build_param": {"M":48, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M48", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M48" - }, - - { - "name" : "cuhnsw.M64", - "algo" : "cuhnsw", - "build_param": {"M":64, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M64", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M64" - }, - - { - "name" : "cuhnsw.M96", - "algo" : "cuhnsw", - "build_param": {"M":96, "efConstruction":500, "block_dim":64}, - "file" : "index/glove-100-inner/cuhnsw/M96", - "search_params" : [ - {"ef":10}, - {"ef":20}, - {"ef":40}, - {"ef":80}, - {"ef":120}, - {"ef":200}, - {"ef":400}, - {"ef":600}, - {"ef":800} - ], - "search_result_file" : "result/glove-100-inner/cuhnsw/M96" - }, - - { - "name" : "cugann.K224", - "algo" : "cugann", - "build_param" : { - "K": 224, - "build_mode": "fast", - "max_edge_num": 160, - "rank_threshold": 6, - "long_edge_threshold": 1.3 - }, - "file" : "index/glove-100-inner/cugann/K224", - "search_params": [ - { "search_mode": "fast", "searcher_num": 8, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 10, "iteration_num": 4 
}, - { "search_mode": "fast", "searcher_num": 14, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 16, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 20, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 26, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 32, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 36, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 40, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 44, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 48, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 52, "iteration_num": 4 }, - { "search_mode": "fast", "searcher_num": 8, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 10, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 14, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 16, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 20, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 26, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 32, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 36, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 40, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 44, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 48, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 52, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 56, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 60, "iteration_num": 5 }, - { "search_mode": "fast", "searcher_num": 32, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 34, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 50, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 64, "iteration_num": 6 }, - { "search_mode": "fast", "searcher_num": 96, "iteration_num": 6 
}, - { "search_mode": "fast", "searcher_num": 128, "iteration_num": 6 } - ], - "search_result_file" : "result/glove-100-inner/cugann/K224" - }, - { "name" : "faiss_ivf_flat.nlist1024", "algo" : "faiss_gpu_ivf_flat", @@ -1012,495 +792,6 @@ }, - { - "name" : "libcuann.dimpq10-cluster1024", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 1024, - "dimPq" : 10 - }, - "file" : "index/glove-100-inner/libcuann/dimpq10-cluster1024", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster1024" - }, - - { - "name" : "libcuann.dimpq10-cluster2048", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 2048, - "dimPq" : 10 - }, - "file" : "index/glove-100-inner/libcuann/dimpq10-cluster2048", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster2048" - }, - - { - "name" : "libcuann.dimpq10-cluster4096", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 4096, - "dimPq" : 10 - }, - "file" : "index/glove-100-inner/libcuann/dimpq10-cluster4096", - "search_params" : [ - 
{"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster4096" - }, - - { - "name" : "libcuann.dimpq10-cluster8192", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 8192, - "dimPq" : 10 - }, - "file" : "index/glove-100-inner/libcuann/dimpq10-cluster8192", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster8192" - }, - - { - "name" : "libcuann.dimpq10-cluster16384", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 16384, - "dimPq" : 10 - }, - "file" : "index/glove-100-inner/libcuann/dimpq10-cluster16384", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000}, - {"max_batch_size":10000, "k":10, "numProbes":2000} - ], - 
"search_result_file" : "result/glove-100-inner/libcuann/dimpq10-cluster16384" - }, - - { - "name" : "libcuann.dimpq20-cluster1024", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 1024, - "dimPq" : 20 - }, - "file" : "index/glove-100-inner/libcuann/dimpq20-cluster1024", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster1024" - }, - - { - "name" : "libcuann.dimpq20-cluster2048", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 2048, - "dimPq" : 20 - }, - "file" : "index/glove-100-inner/libcuann/dimpq20-cluster2048", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster2048" - }, - - { - "name" : "libcuann.dimpq20-cluster4096", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 4096, - "dimPq" : 20 - }, - "file" : "index/glove-100-inner/libcuann/dimpq20-cluster4096", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - 
{"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster4096" - }, - - { - "name" : "libcuann.dimpq20-cluster8192", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 8192, - "dimPq" : 20 - }, - "file" : "index/glove-100-inner/libcuann/dimpq20-cluster8192", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster8192" - }, - - { - "name" : "libcuann.dimpq20-cluster16384", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 16384, - "dimPq" : 20 - }, - "file" : "index/glove-100-inner/libcuann/dimpq20-cluster16384", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000}, - {"max_batch_size":10000, "k":10, "numProbes":2000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq20-cluster16384" - }, - - { - "name" : "libcuann.dimpq50-cluster1024", - "algo" : "libcuann", - 
"build_param": { - "numDataset" : 1183514, - "numClusters" : 1024, - "dimPq" : 50 - }, - "file" : "index/glove-100-inner/libcuann/dimpq50-cluster1024", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster1024" - }, - - { - "name" : "libcuann.dimpq50-cluster2048", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 2048, - "dimPq" : 50 - }, - "file" : "index/glove-100-inner/libcuann/dimpq50-cluster2048", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster2048" - }, - - { - "name" : "libcuann.dimpq50-cluster4096", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 4096, - "dimPq" : 50 - }, - "file" : "index/glove-100-inner/libcuann/dimpq50-cluster4096", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - 
{"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster4096" - }, - - { - "name" : "libcuann.dimpq50-cluster8192", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 8192, - "dimPq" : 50 - }, - "file" : "index/glove-100-inner/libcuann/dimpq50-cluster8192", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster8192" - }, - - { - "name" : "libcuann.dimpq50-cluster16384", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 16384, - "dimPq" : 50 - }, - "file" : "index/glove-100-inner/libcuann/dimpq50-cluster16384", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000}, - {"max_batch_size":10000, "k":10, "numProbes":2000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq50-cluster16384" - }, - - { - "name" : "libcuann.dimpq100-cluster1024", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 1024, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/libcuann/dimpq100-cluster1024", - 
"search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster1024" - }, - - { - "name" : "libcuann.dimpq100-cluster2048", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 2048, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/libcuann/dimpq100-cluster2048", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster2048" - }, - - { - "name" : "libcuann.dimpq100-cluster4096", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 4096, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/libcuann/dimpq100-cluster4096", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : 
"result/glove-100-inner/libcuann/dimpq100-cluster4096" - }, - - { - "name" : "libcuann.dimpq100-cluster8192", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 8192, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/libcuann/dimpq100-cluster8192", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster8192" - }, - - { - "name" : "libcuann.dimpq100-cluster16384", - "algo" : "libcuann", - "build_param": { - "numDataset" : 1183514, - "numClusters" : 16384, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/libcuann/dimpq100-cluster16384", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000}, - {"max_batch_size":10000, "k":10, "numProbes":2000} - ], - "search_result_file" : "result/glove-100-inner/libcuann/dimpq100-cluster16384" - }, - - { - "name" : "ivf_flat.nlist1024", - "algo" : "ivf_flat", - "build_param": { - "nlist":1024, - "ratio":2, - "niter":20 - }, - "file" : "index/glove-100-inner/ivf_flat/nlist1024", - "search_params" : [ - {"max_batch":10000, "max_k":10, "nprobe":1}, - {"max_batch":10000, "max_k":10, "nprobe":5}, - {"max_batch":10000, "max_k":10, "nprobe":10}, - 
{"max_batch":10000, "max_k":10, "nprobe":50}, - {"max_batch":10000, "max_k":10, "nprobe":100}, - {"max_batch":10000, "max_k":10, "nprobe":200}, - {"max_batch":10000, "max_k":10, "nprobe":500}, - {"max_batch":10000, "max_k":10, "nprobe":1000} - ], - "search_result_file" : "result/glove-100-inner/ivf_flat/nlist1024" - }, - - - { - "name" : "multigpu-libcuann.dimpq100-cluster1024", - "algo" : "libcuann", - "multigpu" : [0, 1], - "build_param": { - "numDataset" : 591757, - "numClusters" : 1024, - "dimPq" : 100 - }, - "file" : "index/glove-100-inner/multigpu/libcuann/dimpq100-cluster1024", - "search_params" : [ - {"max_batch_size":10000, "k":10, "numProbes":1}, - {"max_batch_size":10000, "k":10, "numProbes":5}, - {"max_batch_size":10000, "k":10, "numProbes":10}, - {"max_batch_size":10000, "k":10, "numProbes":50}, - {"max_batch_size":10000, "k":10, "numProbes":100}, - {"max_batch_size":10000, "k":10, "numProbes":200}, - {"max_batch_size":10000, "k":10, "numProbes":500}, - {"max_batch_size":10000, "k":10, "numProbes":1000} - ], - "search_result_file" : "result/glove-100-inner/multigpu/libcuann/dimpq100-cluster1024" - } ] } diff --git a/cpp/cuann_bench/conf/sift-128-euclidean.json b/cpp/cuann_bench/conf/sift-128-euclidean.json index e759d3133e..476c363ecd 100644 --- a/cpp/cuann_bench/conf/sift-128-euclidean.json +++ b/cpp/cuann_bench/conf/sift-128-euclidean.json @@ -15,7 +15,7 @@ "name" : "hnswlib.M12", "algo" : "hnswlib", "build_param": {"M":12, "efConstruction":500, "numThreads":32}, - "file" : "index/deep-100M/hnswlib/M12", + "file" : "index/sift-128-euclidean/hnswlib/M12", "search_params" : [ {"ef":10, "numThreads":1}, {"ef":20, "numThreads":1}, @@ -28,13 +28,13 @@ {"ef":600, "numThreads":1}, {"ef":800, "numThreads":1} ], - "search_result_file" : "result/deep-100M/hnswlib/M12" + "search_result_file" : "result/sift-128-euclidean/hnswlib/M12" }, { "name" : "hnswlib.M16", "algo" : "hnswlib", "build_param": {"M":16, "efConstruction":500, "numThreads":32}, - "file" : 
"index/deep-100M/hnswlib/M16", + "file" : "index/sift-128-euclidean/hnswlib/M16", "search_params" : [ {"ef":10, "numThreads":1}, {"ef":20, "numThreads":1}, @@ -47,13 +47,13 @@ {"ef":600, "numThreads":1}, {"ef":800, "numThreads":1} ], - "search_result_file" : "result/deep-100M/hnswlib/M16" + "search_result_file" : "result/sift-128-euclidean/hnswlib/M16" }, { "name" : "hnswlib.M24", "algo" : "hnswlib", "build_param": {"M":24, "efConstruction":500, "numThreads":32}, - "file" : "index/deep-100M/hnswlib/M24", + "file" : "index/sift-128-euclidean/hnswlib/M24", "search_params" : [ {"ef":10, "numThreads":1}, {"ef":20, "numThreads":1}, @@ -66,13 +66,13 @@ {"ef":600, "numThreads":1}, {"ef":800, "numThreads":1} ], - "search_result_file" : "result/deep-100M/hnswlib/M24" + "search_result_file" : "result/sift-128-euclidean/hnswlib/M24" }, { "name" : "hnswlib.M36", "algo" : "hnswlib", "build_param": {"M":36, "efConstruction":500, "numThreads":32}, - "file" : "index/deep-100M/hnswlib/M36", + "file" : "index/sift-128-euclidean/hnswlib/M36", "search_params" : [ {"ef":10, "numThreads":1}, {"ef":20, "numThreads":1}, @@ -85,7 +85,7 @@ {"ef":600, "numThreads":1}, {"ef":800, "numThreads":1} ], - "search_result_file" : "result/deep-100M/hnswlib/M36" + "search_result_file" : "result/sift-128-euclidean/hnswlib/M36" }, @@ -721,769 +721,7 @@ ], "search_result_file": "result/sift-128-euclidean/faiss_flat/flat" }, - { - "name": "libcuann.dimpq128-cluster1024", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 128, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500 - 
}, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024 - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024" - }, - { - "name": "libcuann.dimpq128-cluster1024-prof", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 128, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-prof" - }, - { - "name": "libcuann.dimpq64-cluster1024-float-half", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 64, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-half", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-half" - }, - { - "name": "libcuann.dimpq64-cluster1024-float-fp8", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 64, - "sampleRatio": 1 
- }, - "file": "index/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-fp8", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq64-cluster1024-float-fp8" - }, - { - "name": "libcuann.dimpq32-cluster1024-float-fp8", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 32, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq32-cluster1024-float-fp8", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { 
- "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq32-cluster1024-float-fp8" - }, - { - "name": "libcuann.dimpq16-cluster1024-float-fp8", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 16, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq16-cluster1024-float-fp8", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - 
"smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq16-cluster1024-float-fp8" - }, - { - "name": "libcuann.dimpq128-cluster1024-float-float", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 128, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-float" - }, - { - "name": "libcuann.dimpq128-cluster1024-float-half", - "algo": 
"libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 128, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-half", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1000, - "internalDistanceDtype": "float", - "smemLutDtype": "half" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-half" - }, - { - "name": "libcuann.dimpq128-cluster1024-float-fp8", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 1024, - "dimPq": 128, - "sampleRatio": 1 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-fp8", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500, - "internalDistanceDtype": "float", - "smemLutDtype": 
"fp8" - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1024, - "internalDistanceDtype": "float", - "smemLutDtype": "fp8" - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster1024-float-fp8" - }, - { - "name": "libcuann.dimpq128-cluster16384", - "algo": "libcuann", - "build_param": { - "numDataset": 1000000, - "numClusters": 16384, - "dimPq": 128, - "sampleRatio": 2 - }, - "file": "index/sift-128-euclidean/libcuann/dimpq128-cluster16384", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1000 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 2000 - } - ], - "search_result_file": "result/sift-128-euclidean/libcuann/dimpq128-cluster16384" - }, - { - "name": "ivf_flat.nlist1024", - "algo": "ivf_flat", - "build_param": { - "nlist": 1024, - "ratio": 1, - "niter": 25 - }, - "file": "index/sift-128-euclidean/ivf_flat/nlist1024", - "search_params": [ - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 1 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 5 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 10 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 50 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 100 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 200 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 500 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 1000 - } - ], - "search_result_file": "result/sift-128-euclidean/ivf_flat/nlist1024" - }, - { - "name": 
"ivf_flat.nlist16384", - "algo": "ivf_flat", - "build_param": { - "nlist": 16384, - "ratio": 2, - "niter": 20 - }, - "file": "index/sift-128-euclidean/ivf_flat/nlist16384", - "search_params": [ - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 1 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 5 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 10 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 50 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 100 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 200 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 500 - }, - { - "max_batch": 10000, - "max_k": 10, - "nprobe": 1000 - } - ], - "search_result_file": "result/sift-128-euclidean/ivf_flat/nlist16384" - }, - { - "name": "multigpu-libcuann.dimpq128-cluster1024", - "algo": "libcuann", - "multigpu": [ - 0, - 1 - ], - "build_param": { - "numDataset": 591757, - "numClusters": 1024, - "dimPq": 128 - }, - "file": "index/sift-128-euclidean/multigpu/libcuann/dimpq128-cluster1024", - "search_params": [ - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 5 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 10 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 50 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 100 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 200 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 500 - }, - { - "max_batch_size": 10000, - "k": 10, - "numProbes": 1000 - } - ], - "search_result_file": "result/sift-128-euclidean/multigpu/libcuann/dimpq128-cluster1024" - }, + { "name": "raft_ivf_pq.dimpq128-cluster1024", "algo": "raft_ivf_pq", @@ -1534,26 +772,6 @@ ], "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024" }, - { - "name": "raft_ivf_pq.dimpq128-cluster1024-prof", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 128, - 
"ratio": 1, - "niter": 25 - }, - "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float", - "search_params": [ - { - "k": 10, - "numProbes": 200, - "internalDistanceDtype": "float", - "smemLutDtype": "float" - } - ], - "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-prof" - }, { "name": "raft_ivf_pq.dimpq128-cluster1024-float-float", "algo": "raft_ivf_pq", @@ -1781,7 +999,7 @@ "ratio": 1, "niter": 25 }, - "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8", + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half", "search_params": [ { "k": 10, @@ -1820,7 +1038,7 @@ "smemLutDtype": "half" } ], - "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8" + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half" }, { "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8", From f4c2282755f81d9b67b832c8ca97f6a06a503b10 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 25 Feb 2023 11:26:11 -0500 Subject: [PATCH 04/39] Moving `cpp/cuann_bench` to `cpp/bench/ann` and `cpp/bench/prims` --- build.sh | 70 ++++----- conda/recipes/libraft/build_libraft_tests.sh | 2 +- cpp/CMakeLists.txt | 25 +-- cpp/bench/CMakeLists.txt | 138 ----------------- cpp/{cuann_bench => bench/ann}/CMakeLists.txt | 95 +++++------- cpp/{cuann_bench => bench/ann}/README.md | 0 .../ann}/conf/bigann-100M.json | 0 .../ann}/conf/deep-100M.json | 0 .../ann}/conf/deep-1B.json | 0 .../ann}/conf/glove-100-inner.json | 0 .../ann}/conf/sift-128-euclidean.json | 0 .../ann}/scripts/eval.pl | 0 .../ann}/scripts/fbin_to_f16bin.py | 0 .../ann}/scripts/hdf5_to_fbin.py | 0 .../ann}/scripts/split_groundtruth.pl | 0 cpp/{cuann_bench => bench/ann}/src/ann.h | 0 .../ann}/src/benchmark.cpp | 0 .../ann}/src/benchmark.cu | 0 cpp/{cuann_bench => bench/ann}/src/conf.cpp | 0 cpp/{cuann_bench => bench/ann}/src/conf.h | 0 .../ann}/src/cudart_util.h | 0 cpp/{cuann_bench => bench/ann}/src/dataset.h | 0 .../ann}/src/factory.cuh | 46 +++--- cpp/{cuann_bench => bench/ann}/src/factory.h | 10 +- .../ann}/src/faiss_wrapper.h | 0 .../ann}/src/ggnn_wrapper.cuh | 0 .../ann}/src/hnswlib_wrapper.h | 0 .../ann}/src/multigpu.cuh | 0 .../ann}/src/raft_cuann_utils.h | 0 .../ann}/src/raft_ivf_flat.cu | 0 .../ann}/src/raft_ivf_flat_wrapper.h | 0 .../ann}/src/raft_ivf_pq.cu | 0 .../ann}/src/raft_ivf_pq_wrapper.h | 0 .../ann}/src/raft_wrapper.h | 0 cpp/{cuann_bench => bench/ann}/src/util.cpp | 0 cpp/{cuann_bench => bench/ann}/src/util.h | 0 .../ann}/third_party/patches/ggnn.patch | 0 .../ann}/third_party/patches/json.patch | 0 cpp/bench/prims/CMakeLists.txt | 142 ++++++++++++++++++ cpp/bench/{ => prims}/cluster/kmeans.cu | 2 +- .../{ => prims}/cluster/kmeans_balanced.cu | 2 +- cpp/bench/{ => prims}/common/benchmark.hpp | 0 .../{ => prims}/distance/distance_common.cuh | 0 .../{ => prims}/distance/distance_cosine.cu | 2 +- .../{ => prims}/distance/distance_exp_l2.cu | 2 +- cpp/bench/{ => 
prims}/distance/distance_l1.cu | 2 +- .../{ => prims}/distance/distance_unexp_l2.cu | 2 +- cpp/bench/{ => prims}/distance/fused_l2_nn.cu | 2 +- cpp/bench/{ => prims}/distance/kernels.cu | 0 cpp/bench/{ => prims}/distance/masked_nn.cu | 0 cpp/bench/{ => prims}/linalg/add.cu | 2 +- .../{ => prims}/linalg/map_then_reduce.cu | 2 +- .../{ => prims}/linalg/matrix_vector_op.cu | 2 +- cpp/bench/{ => prims}/linalg/norm.cu | 2 +- cpp/bench/{ => prims}/linalg/normalize.cu | 2 +- cpp/bench/{ => prims}/linalg/reduce.cu | 2 +- .../{ => prims}/linalg/reduce_cols_by_key.cu | 2 +- .../{ => prims}/linalg/reduce_rows_by_key.cu | 2 +- cpp/bench/{ => prims}/main.cpp | 2 +- cpp/bench/{ => prims}/matrix/argmin.cu | 0 cpp/bench/{ => prims}/matrix/gather.cu | 0 cpp/bench/{ => prims}/matrix/select_k.cu | 0 cpp/bench/{ => prims}/neighbors/knn.cuh | 0 .../knn/brute_force_float_int64_t.cu | 2 +- .../knn/brute_force_float_uint32_t.cu | 2 +- .../neighbors/knn/ivf_flat_float_int64_t.cu | 2 +- .../neighbors/knn/ivf_flat_int8_t_int64_t.cu | 2 +- .../neighbors/knn/ivf_flat_uint8_t_int64_t.cu | 0 .../neighbors/knn/ivf_pq_float_uint64_t.cu | 0 .../neighbors/knn/ivf_pq_int8_t_uint64_t.cu | 0 .../neighbors/knn/ivf_pq_uint8_t_uint64_t.cu | 0 cpp/bench/{ => prims}/neighbors/refine.cuh | 0 .../neighbors/refine_float_uint64_t.cu | 0 .../neighbors/refine_uint8_t_uint64_t.cu | 0 cpp/bench/{ => prims}/random/make_blobs.cu | 2 +- cpp/bench/{ => prims}/random/permute.cu | 0 cpp/bench/{ => prims}/random/rng.cu | 2 +- cpp/bench/{ => prims}/sparse/convert_csr.cu | 0 78 files changed, 282 insertions(+), 290 deletions(-) delete mode 100644 cpp/bench/CMakeLists.txt rename cpp/{cuann_bench => bench/ann}/CMakeLists.txt (52%) rename cpp/{cuann_bench => bench/ann}/README.md (100%) rename cpp/{cuann_bench => bench/ann}/conf/bigann-100M.json (100%) rename cpp/{cuann_bench => bench/ann}/conf/deep-100M.json (100%) rename cpp/{cuann_bench => bench/ann}/conf/deep-1B.json (100%) rename cpp/{cuann_bench => 
bench/ann}/conf/glove-100-inner.json (100%) rename cpp/{cuann_bench => bench/ann}/conf/sift-128-euclidean.json (100%) rename cpp/{cuann_bench => bench/ann}/scripts/eval.pl (100%) rename cpp/{cuann_bench => bench/ann}/scripts/fbin_to_f16bin.py (100%) rename cpp/{cuann_bench => bench/ann}/scripts/hdf5_to_fbin.py (100%) rename cpp/{cuann_bench => bench/ann}/scripts/split_groundtruth.pl (100%) rename cpp/{cuann_bench => bench/ann}/src/ann.h (100%) rename cpp/{cuann_bench => bench/ann}/src/benchmark.cpp (100%) rename cpp/{cuann_bench => bench/ann}/src/benchmark.cu (100%) rename cpp/{cuann_bench => bench/ann}/src/conf.cpp (100%) rename cpp/{cuann_bench => bench/ann}/src/conf.h (100%) rename cpp/{cuann_bench => bench/ann}/src/cudart_util.h (100%) rename cpp/{cuann_bench => bench/ann}/src/dataset.h (100%) rename cpp/{cuann_bench => bench/ann}/src/factory.cuh (92%) rename cpp/{cuann_bench => bench/ann}/src/factory.h (95%) rename cpp/{cuann_bench => bench/ann}/src/faiss_wrapper.h (100%) rename cpp/{cuann_bench => bench/ann}/src/ggnn_wrapper.cuh (100%) rename cpp/{cuann_bench => bench/ann}/src/hnswlib_wrapper.h (100%) rename cpp/{cuann_bench => bench/ann}/src/multigpu.cuh (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_cuann_utils.h (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_ivf_flat.cu (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_ivf_flat_wrapper.h (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_ivf_pq.cu (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_ivf_pq_wrapper.h (100%) rename cpp/{cuann_bench => bench/ann}/src/raft_wrapper.h (100%) rename cpp/{cuann_bench => bench/ann}/src/util.cpp (100%) rename cpp/{cuann_bench => bench/ann}/src/util.h (100%) rename cpp/{cuann_bench => bench/ann}/third_party/patches/ggnn.patch (100%) rename cpp/{cuann_bench => bench/ann}/third_party/patches/json.patch (100%) create mode 100644 cpp/bench/prims/CMakeLists.txt rename cpp/bench/{ => prims}/cluster/kmeans.cu (99%) rename cpp/bench/{ => 
prims}/cluster/kmeans_balanced.cu (98%) rename cpp/bench/{ => prims}/common/benchmark.hpp (100%) rename cpp/bench/{ => prims}/distance/distance_common.cuh (100%) rename cpp/bench/{ => prims}/distance/distance_cosine.cu (94%) rename cpp/bench/{ => prims}/distance/distance_exp_l2.cu (94%) rename cpp/bench/{ => prims}/distance/distance_l1.cu (93%) rename cpp/bench/{ => prims}/distance/distance_unexp_l2.cu (94%) rename cpp/bench/{ => prims}/distance/fused_l2_nn.cu (99%) rename cpp/bench/{ => prims}/distance/kernels.cu (100%) rename cpp/bench/{ => prims}/distance/masked_nn.cu (100%) rename cpp/bench/{ => prims}/linalg/add.cu (96%) rename cpp/bench/{ => prims}/linalg/map_then_reduce.cu (97%) rename cpp/bench/{ => prims}/linalg/matrix_vector_op.cu (99%) rename cpp/bench/{ => prims}/linalg/norm.cu (98%) rename cpp/bench/{ => prims}/linalg/normalize.cu (98%) rename cpp/bench/{ => prims}/linalg/reduce.cu (97%) rename cpp/bench/{ => prims}/linalg/reduce_cols_by_key.cu (98%) rename cpp/bench/{ => prims}/linalg/reduce_rows_by_key.cu (98%) rename cpp/bench/{ => prims}/main.cpp (92%) rename cpp/bench/{ => prims}/matrix/argmin.cu (100%) rename cpp/bench/{ => prims}/matrix/gather.cu (100%) rename cpp/bench/{ => prims}/matrix/select_k.cu (100%) rename cpp/bench/{ => prims}/neighbors/knn.cuh (100%) rename cpp/bench/{ => prims}/neighbors/knn/brute_force_float_int64_t.cu (93%) rename cpp/bench/{ => prims}/neighbors/knn/brute_force_float_uint32_t.cu (93%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_flat_float_int64_t.cu (93%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_flat_int8_t_int64_t.cu (93%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_flat_uint8_t_int64_t.cu (100%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_pq_float_uint64_t.cu (100%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_pq_int8_t_uint64_t.cu (100%) rename cpp/bench/{ => prims}/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu (100%) rename cpp/bench/{ => prims}/neighbors/refine.cuh (100%) rename cpp/bench/{ => 
prims}/neighbors/refine_float_uint64_t.cu (100%) rename cpp/bench/{ => prims}/neighbors/refine_uint8_t_uint64_t.cu (100%) rename cpp/bench/{ => prims}/random/make_blobs.cu (98%) rename cpp/bench/{ => prims}/random/permute.cu (100%) rename cpp/bench/{ => prims}/random/rng.cu (98%) rename cpp/bench/{ => prims}/sparse/convert_csr.cu (100%) diff --git a/build.sh b/build.sh index 107d1102a4..77dc329043 100755 --- a/build.sh +++ b/build.sh @@ -18,8 +18,8 @@ ARGS=$* # scripts, and that this scripts resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests bench cuann_bench clean --uninstall -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" -HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] +VALIDARGS="clean libraft pylibraft raft-dask docs tests bench-prims bench-ann clean --uninstall -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" +HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] where is: clean - remove all existing build artifacts and configuration (start over) libraft - build the raft C++ code only. Also builds the C-wrapper library @@ -28,8 +28,8 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool= is: -v - verbose build mode @@ -44,8 +44,8 @@ HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool=:raft::distance> - $<$:raft::nn> - benchmark::benchmark - Threads::Threads - $ - $ - ) - - set_target_properties( - ${BENCH_NAME} - PROPERTIES # set target compile options - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - ) - - target_compile_options( - ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) - - target_include_directories(${BENCH_NAME} PUBLIC "$") - - install( - TARGETS ${BENCH_NAME} - COMPONENT testing - DESTINATION bin/gbench/libraft - EXCLUDE_FROM_ALL - ) - -endfunction() - -if(BUILD_BENCH) - ConfigureBench( - NAME CLUSTER_BENCH PATH bench/cluster/kmeans_balanced.cu bench/cluster/kmeans.cu bench/main.cpp - OPTIONAL DIST NN - ) - - ConfigureBench( - NAME - DISTANCE_BENCH - PATH - bench/distance/distance_cosine.cu - bench/distance/distance_exp_l2.cu - bench/distance/distance_l1.cu - bench/distance/distance_unexp_l2.cu - bench/distance/fused_l2_nn.cu - bench/distance/masked_nn.cu - bench/distance/kernels.cu - bench/main.cpp - OPTIONAL - DIST - ) - - ConfigureBench( - NAME - LINALG_BENCH - PATH - bench/linalg/add.cu - bench/linalg/map_then_reduce.cu - bench/linalg/matrix_vector_op.cu - bench/linalg/norm.cu - bench/linalg/normalize.cu - bench/linalg/reduce_cols_by_key.cu - bench/linalg/reduce_rows_by_key.cu - bench/linalg/reduce.cu - bench/main.cpp - ) - - ConfigureBench( - NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu - bench/main.cpp - ) - - ConfigureBench( - NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu - bench/main.cpp - ) - - ConfigureBench(NAME SPARSE_BENCH PATH bench/sparse/convert_csr.cu bench/main.cpp) - - ConfigureBench( - NAME - NEIGHBORS_BENCH - PATH - bench/neighbors/knn/brute_force_float_int64_t.cu - 
bench/neighbors/knn/brute_force_float_uint32_t.cu - bench/neighbors/knn/ivf_flat_float_int64_t.cu - bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu - bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu - bench/neighbors/knn/ivf_pq_float_uint64_t.cu - bench/neighbors/knn/ivf_pq_int8_t_uint64_t.cu - bench/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu - bench/neighbors/refine_float_uint64_t.cu - bench/neighbors/refine_uint8_t_uint64_t.cu - bench/main.cpp - OPTIONAL - DIST - NN - ) -endif() diff --git a/cpp/cuann_bench/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt similarity index 52% rename from cpp/cuann_bench/CMakeLists.txt rename to cpp/bench/ann/CMakeLists.txt index 9f26afaac6..2a0fce6458 100644 --- a/cpp/cuann_bench/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,50 +15,37 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- -option(RAFT_CUANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" OFF) -option(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) -option(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) -option(RAFT_CUANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" OFF) -option(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" OFF) -option(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) -option(RAFT_CUANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) - -set(RAFT_CUANN_BENCH_USE_FAISS OFF) -if(RAFT_CUANN_BENCH_USE_FAISS_BFKNN - OR RAFT_CUANN_BENCH_USE_FAISS_IVFPQ - OR RAFT_CUANN_BENCH_USE_FAISS_IFFLAT +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" OFF) 
+option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) + +set(RAFT_ANN_BENCH_USE_FAISS OFF) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN + OR RAFT_ANN_BENCH_USE_FAISS_IVFPQ + OR RAFT_ANN_BENCH_USE_FAISS_IFFLAT ) - set(RAFT_CUANN_BENCH_USE_FAISS ON) + set(RAFT_ANN_BENCH_USE_FAISS ON) endif() -if(RAFT_CUANN_BENCH_USE_HNSWLIB) +if(RAFT_ANN_BENCH_USE_HNSWLIB) + message("Using hnswlib") include(cmake/thirdparty/get_hnswlib.cmake) endif() -set(RAFT_CUANN_BENCH_USE_RAFT OFF) -if(RAFT_CUANN_BENCH_USE_RAFT_BFKNN - OR RAFT_CUANN_BENCH_USE_RAFT_IVFPQ - OR RAFT_CUANN_BENCH_USE_RAFT_IFFLAT -) - set(RAFT_CUANN_BENCH_USE_RAFT ON) -endif() - -if(NOT RAFT_CUANN_BENCH_USE_RAFT) - set(RAFT_COMPILE_DISTANCE_LIBRARY OFF) - set(RAFT_COMPILE_NN_LIBRARY OFF) - set(RAFT_ENABLE_NN_DEPENDENCIES OFF) -endif() - -option(RAFT_CUANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) +option(RAFT_ANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) include(cmake/thirdparty/get_nlohmann_json.cmake) -if(RAFT_CUANN_BENCH_USE_GGNN) +if(RAFT_ANN_BENCH_USE_GGNN) include(cmake/thirdparty/get_ggnn.cmake) endif() -if(RAFT_CUANN_BENCH_USE_FAISS) +if(RAFT_ANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss.cmake) endif() @@ -71,16 +58,16 @@ function(ConfigureCuannBench) ConfigureCuannBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) - set(BENCH_NAME CUANN_BENCH_${ConfigureCuannBench_NAME}) + 
set(BENCH_NAME ${ConfigureCuannBench_NAME}_ANN_BENCH) add_executable( - ${BENCH_NAME} ${ConfigureCuannBench_PATH} cuann_bench/src/conf.cpp cuann_bench/src/util.cpp + ${BENCH_NAME} ${ConfigureCuannBench_PATH} bench/ann/src/conf.cpp bench/ann/src/util.cpp ) target_link_libraries( ${BENCH_NAME} PRIVATE raft::raft nlohmann_json::nlohmann_json - $<$:NCCL::NCCL> + $<$:NCCL::NCCL> ${ConfigureCuannBench_LINKS} Threads::Threads $ @@ -106,11 +93,11 @@ function(ConfigureCuannBench) "$<$:${RAFT_CUDA_FLAGS}>" ) - if(RAFT_CUANN_BENCH_USE_${ConfigureCuannBench_NAME}) + if(RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME}) target_compile_definitions( ${BENCH_NAME} PUBLIC - RAFT_CUANN_BENCH_USE_${ConfigureCuannBench_NAME}=RAFT_CUANN_BENCH_USE_${ConfigureCuannBench_NAME} + RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME} ) endif() @@ -123,53 +110,53 @@ function(ConfigureCuannBench) install( TARGETS ${BENCH_NAME} COMPONENT testing - DESTINATION bin/cuann_bench + DESTINATION bin/ann EXCLUDE_FROM_ALL ) endfunction() -if(RAFT_CUANN_BENCH_USE_HNSWLIB) +if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureCuannBench( - NAME HNSWLIB PATH cuann_bench/src/benchmark.cpp INCLUDES + NAME HNSWLIB PATH bench/ann/src/benchmark.cpp INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib/hnswlib CXXFLAGS -mavx ) endif() -if(RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ) +if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) ConfigureCuannBench( - NAME RAFT_IVF_PQ PATH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_pq.cu LINKS + NAME RAFT_IVF_PQ PATH bench/ann/src/benchmark.cu bench/ann/src/raft_ivf_pq.cu LINKS raft::distance raft::nn ) endif() -if(RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT) +if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) ConfigureCuannBench( - NAME RAFT_IVF_FLAT PATH cuann_bench/src/benchmark.cu cuann_bench/src/raft_ivf_flat.cu LINKS + NAME RAFT_IVF_FLAT PATH bench/ann/src/benchmark.cu bench/ann/src/raft_ivf_flat.cu LINKS raft::distance raft::nn ) endif() -if(RAFT_CUANN_BENCH_USE_RAFT_BFKNN) 
+if(RAFT_ANN_BENCH_USE_RAFT_BFKNN) ConfigureCuannBench( - NAME RAFT_IVF_FLAT PATH cuann_bench/src/benchmark.cu LINKS raft::distance raft::nn + NAME RAFT_IVF_FLAT PATH bench/ann/src/benchmark.cu LINKS raft::distance raft::nn ) endif() -if(RAFT_CUANN_BENCH_USE_FAISS_IVF_FLAT) - ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) + ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH bench/ann/src/benchmark.cu LINKS faiss::faiss) endif() -if(RAFT_CUANN_BENCH_USE_FAISS_IVF_PQ) - ConfigureCuannBench(NAME FAISS_IVF_PQ PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) + ConfigureCuannBench(NAME FAISS_IVF_PQ PATH bench/ann/src/benchmark.cu LINKS faiss::faiss) endif() -if(RAFT_CUANN_BENCH_USE_FAISS_BFKNN) - ConfigureCuannBench(NAME FAISS_BFKNN PATH cuann_bench/src/benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) + ConfigureCuannBench(NAME FAISS_BFKNN PATH bench/ann/src/benchmark.cu LINKS faiss::faiss) endif() -if(RAFT_CUANN_BENCH_USE_GGNN) +if(RAFT_ANN_BENCH_USE_GGNN) ConfigureCuannBench( - NAME GGNN PATH cuann_bench/src/benchmark.cu INCLUDES + NAME GGNN PATH bench/ann/src/benchmark.cu INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include ) endif() diff --git a/cpp/cuann_bench/README.md b/cpp/bench/ann/README.md similarity index 100% rename from cpp/cuann_bench/README.md rename to cpp/bench/ann/README.md diff --git a/cpp/cuann_bench/conf/bigann-100M.json b/cpp/bench/ann/conf/bigann-100M.json similarity index 100% rename from cpp/cuann_bench/conf/bigann-100M.json rename to cpp/bench/ann/conf/bigann-100M.json diff --git a/cpp/cuann_bench/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json similarity index 100% rename from cpp/cuann_bench/conf/deep-100M.json rename to cpp/bench/ann/conf/deep-100M.json diff --git a/cpp/cuann_bench/conf/deep-1B.json b/cpp/bench/ann/conf/deep-1B.json similarity index 100% rename from 
cpp/cuann_bench/conf/deep-1B.json rename to cpp/bench/ann/conf/deep-1B.json diff --git a/cpp/cuann_bench/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json similarity index 100% rename from cpp/cuann_bench/conf/glove-100-inner.json rename to cpp/bench/ann/conf/glove-100-inner.json diff --git a/cpp/cuann_bench/conf/sift-128-euclidean.json b/cpp/bench/ann/conf/sift-128-euclidean.json similarity index 100% rename from cpp/cuann_bench/conf/sift-128-euclidean.json rename to cpp/bench/ann/conf/sift-128-euclidean.json diff --git a/cpp/cuann_bench/scripts/eval.pl b/cpp/bench/ann/scripts/eval.pl similarity index 100% rename from cpp/cuann_bench/scripts/eval.pl rename to cpp/bench/ann/scripts/eval.pl diff --git a/cpp/cuann_bench/scripts/fbin_to_f16bin.py b/cpp/bench/ann/scripts/fbin_to_f16bin.py similarity index 100% rename from cpp/cuann_bench/scripts/fbin_to_f16bin.py rename to cpp/bench/ann/scripts/fbin_to_f16bin.py diff --git a/cpp/cuann_bench/scripts/hdf5_to_fbin.py b/cpp/bench/ann/scripts/hdf5_to_fbin.py similarity index 100% rename from cpp/cuann_bench/scripts/hdf5_to_fbin.py rename to cpp/bench/ann/scripts/hdf5_to_fbin.py diff --git a/cpp/cuann_bench/scripts/split_groundtruth.pl b/cpp/bench/ann/scripts/split_groundtruth.pl similarity index 100% rename from cpp/cuann_bench/scripts/split_groundtruth.pl rename to cpp/bench/ann/scripts/split_groundtruth.pl diff --git a/cpp/cuann_bench/src/ann.h b/cpp/bench/ann/src/ann.h similarity index 100% rename from cpp/cuann_bench/src/ann.h rename to cpp/bench/ann/src/ann.h diff --git a/cpp/cuann_bench/src/benchmark.cpp b/cpp/bench/ann/src/benchmark.cpp similarity index 100% rename from cpp/cuann_bench/src/benchmark.cpp rename to cpp/bench/ann/src/benchmark.cpp diff --git a/cpp/cuann_bench/src/benchmark.cu b/cpp/bench/ann/src/benchmark.cu similarity index 100% rename from cpp/cuann_bench/src/benchmark.cu rename to cpp/bench/ann/src/benchmark.cu diff --git a/cpp/cuann_bench/src/conf.cpp b/cpp/bench/ann/src/conf.cpp 
similarity index 100% rename from cpp/cuann_bench/src/conf.cpp rename to cpp/bench/ann/src/conf.cpp diff --git a/cpp/cuann_bench/src/conf.h b/cpp/bench/ann/src/conf.h similarity index 100% rename from cpp/cuann_bench/src/conf.h rename to cpp/bench/ann/src/conf.h diff --git a/cpp/cuann_bench/src/cudart_util.h b/cpp/bench/ann/src/cudart_util.h similarity index 100% rename from cpp/cuann_bench/src/cudart_util.h rename to cpp/bench/ann/src/cudart_util.h diff --git a/cpp/cuann_bench/src/dataset.h b/cpp/bench/ann/src/dataset.h similarity index 100% rename from cpp/cuann_bench/src/dataset.h rename to cpp/bench/ann/src/dataset.h diff --git a/cpp/cuann_bench/src/factory.cuh b/cpp/bench/ann/src/factory.cuh similarity index 92% rename from cpp/cuann_bench/src/factory.cuh rename to cpp/bench/ann/src/factory.cuh index 03a5e176d6..5a6e6fc694 100644 --- a/cpp/cuann_bench/src/factory.cuh +++ b/cpp/bench/ann/src/factory.cuh @@ -26,26 +26,26 @@ #include "ann.h" #undef WARP_SIZE -#ifdef RAFT_CUANN_BENCH_USE_FAISS +#ifdef RAFT_ANN_BENCH_USE_FAISS #include "faiss_wrapper.h" #endif -#ifdef RAFT_CUANN_BENCH_USE_GGNN +#ifdef RAFT_ANN_BENCH_USE_GGNN #include "ggnn_wrapper.cuh" #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN #include "raft_wrapper.h" #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT #include "raft_ivf_flat_wrapper.h" extern template class cuann::RaftIvfFlatGpu; extern template class cuann::RaftIvfFlatGpu; #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ #include "raft_ivf_pq_wrapper.h" extern template class cuann::RaftIvfPQ; extern template class cuann::RaftIvfPQ; #endif -#ifdef RAFT_CUANN_BENCH_USE_MULTI_GPU +#ifdef RAFT_ANN_BENCH_USE_MULTI_GPU #include "multigpu.cuh" #endif #define JSON_DIAGNOSTICS 1 @@ -64,7 +64,7 @@ cuann::Metric parse_metric(const std::string& metric_str) } } -#ifdef RAFT_CUANN_BENCH_USE_FAISS +#ifdef RAFT_ANN_BENCH_USE_FAISS template 
void parse_build_param(const nlohmann::json& conf, typename cuann::FaissGpuIVFFlat::BuildParam& param) @@ -105,7 +105,7 @@ void parse_search_param(const nlohmann::json& conf, typename cuann::FaissGpu: } #endif -#ifdef RAFT_CUANN_BENCH_USE_GGNN +#ifdef RAFT_ANN_BENCH_USE_GGNN template void parse_build_param(const nlohmann::json& conf, typename cuann::Ggnn::BuildParam& param) { @@ -133,7 +133,7 @@ void parse_search_param(const nlohmann::json& conf, typename cuann::Ggnn::Sea } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT template void parse_build_param(const nlohmann::json& conf, typename cuann::RaftIvfFlatGpu::BuildParam& param) @@ -154,7 +154,7 @@ void parse_search_param(const nlohmann::json& conf, } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ template void parse_build_param(const nlohmann::json& conf, typename cuann::RaftIvfPQ::BuildParam& param) @@ -222,7 +222,7 @@ std::unique_ptr> make_algo(cuann::Metric metric, typename Algo::BuildParam param; parse_build_param(conf, param); -#ifdef RAFT_CUANN_BENCH_USE_MULTI_GPU +#ifdef RAFT_ANN_BENCH_USE_MULTI_GPU if (dev_list.empty()) { return std::make_unique>(metric, dim, param); } else { @@ -244,10 +244,10 @@ std::unique_ptr> create_algo(const std::string& algo, { // stop compiler warning; not all algorithms support multi-GPU so it may not be used (void)dev_list; -#ifndef RAFT_CUANN_BENCH_USE_MULTI_GPU +#ifndef RAFT_ANN_BENCH_USE_MULTI_GPU if (!dev_list.empty()) { throw std::runtime_error( - "compiled without RAFT_CUANN_BENCH_USE_MULTI_GPU, but a device list is given"); + "compiled without RAFT_ANN_BENCH_USE_MULTI_GPU, but a device list is given"); } #endif @@ -255,7 +255,7 @@ std::unique_ptr> create_algo(const std::string& algo, std::unique_ptr> ann; if constexpr (std::is_same_v) { -#ifdef RAFT_CUANN_BENCH_USE_FAISS +#ifdef RAFT_ANN_BENCH_USE_FAISS if (algo == "faiss_gpu_ivf_flat") { ann = make_algo(metric, dim, conf, dev_list); } 
else if (algo == "faiss_gpu_ivf_pq") { @@ -266,24 +266,24 @@ std::unique_ptr> create_algo(const std::string& algo, ann = std::make_unique>(metric, dim); } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } #endif } if constexpr (std::is_same_v) {} -#ifdef RAFT_CUANN_BENCH_USE_GGNN +#ifdef RAFT_ANN_BENCH_USE_GGNN if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT if (algo == "raft_ivf_flat") { typename cuann::RaftIvfFlatGpu::BuildParam param; parse_build_param(conf, param); ann = std::make_unique>(metric, dim, param); } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ if (algo == "raft_ivf_pq") { typename cuann::RaftIvfPQ::BuildParam param; parse_build_param(conf, param); @@ -300,7 +300,7 @@ template std::unique_ptr::AnnSearchParam> create_search_param( const std::string& algo, const nlohmann::json& conf) { -#ifdef RAFT_CUANN_BENCH_USE_FAISS +#ifdef RAFT_ANN_BENCH_USE_FAISS if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); @@ -310,27 +310,27 @@ std::unique_ptr::AnnSearchParam> create_search_param( return param; } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_BFKNN +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN if (algo == "raft_bfknn") { auto param = std::make_unique::AnnSearchParam>(); return param; } #endif -#ifdef RAFT_CUANN_BENCH_USE_GGNN +#ifdef RAFT_ANN_BENCH_USE_GGNN if (algo == "ggnn") { auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); return param; } #endif -#ifdef RAFT_CUANN_BENCH_USE_RAFT_IVF_FLAT +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT if (algo == "raft_ivf_flat") { auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); return param; } #endif -#ifdef 
RAFT_CUANN_BENCH_USE_RAFT_IVF_PQ +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ if (algo == "raft_ivf_pq") { auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); diff --git a/cpp/cuann_bench/src/factory.h b/cpp/bench/ann/src/factory.h similarity index 95% rename from cpp/cuann_bench/src/factory.h rename to cpp/bench/ann/src/factory.h index 30d146d1b1..b4c3c7602d 100644 --- a/cpp/cuann_bench/src/factory.h +++ b/cpp/bench/ann/src/factory.h @@ -26,7 +26,7 @@ #include "ann.h" #undef WARP_SIZE -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#ifdef RAFT_ANN_BENCH_USE_HNSWLIB #include "hnswlib_wrapper.h" #endif #define JSON_DIAGNOSTICS 1 @@ -45,7 +45,7 @@ cuann::Metric parse_metric(const std::string& metric_str) } } -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#ifdef RAFT_ANN_BENCH_USE_HNSWLIB template void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) { @@ -98,13 +98,13 @@ std::unique_ptr> create_algo(const std::string& algo, std::unique_ptr> ann; if constexpr (std::is_same_v) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#ifdef RAFT_ANN_BENCH_USE_HNSWLIB if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } #endif } if constexpr (std::is_same_v) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#ifdef RAFT_ANN_BENCH_USE_HNSWLIB if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } #endif } @@ -119,7 +119,7 @@ template std::unique_ptr::AnnSearchParam> create_search_param( const std::string& algo, const nlohmann::json& conf) { -#ifdef RAFT_CUANN_BENCH_USE_HNSWLIB +#ifdef RAFT_ANN_BENCH_USE_HNSWLIB if (algo == "hnswlib") { auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); diff --git a/cpp/cuann_bench/src/faiss_wrapper.h b/cpp/bench/ann/src/faiss_wrapper.h similarity index 100% rename from cpp/cuann_bench/src/faiss_wrapper.h rename to cpp/bench/ann/src/faiss_wrapper.h diff --git a/cpp/cuann_bench/src/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn_wrapper.cuh similarity index 100% rename from 
cpp/cuann_bench/src/ggnn_wrapper.cuh rename to cpp/bench/ann/src/ggnn_wrapper.cuh diff --git a/cpp/cuann_bench/src/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib_wrapper.h similarity index 100% rename from cpp/cuann_bench/src/hnswlib_wrapper.h rename to cpp/bench/ann/src/hnswlib_wrapper.h diff --git a/cpp/cuann_bench/src/multigpu.cuh b/cpp/bench/ann/src/multigpu.cuh similarity index 100% rename from cpp/cuann_bench/src/multigpu.cuh rename to cpp/bench/ann/src/multigpu.cuh diff --git a/cpp/cuann_bench/src/raft_cuann_utils.h b/cpp/bench/ann/src/raft_cuann_utils.h similarity index 100% rename from cpp/cuann_bench/src/raft_cuann_utils.h rename to cpp/bench/ann/src/raft_cuann_utils.h diff --git a/cpp/cuann_bench/src/raft_ivf_flat.cu b/cpp/bench/ann/src/raft_ivf_flat.cu similarity index 100% rename from cpp/cuann_bench/src/raft_ivf_flat.cu rename to cpp/bench/ann/src/raft_ivf_flat.cu diff --git a/cpp/cuann_bench/src/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft_ivf_flat_wrapper.h similarity index 100% rename from cpp/cuann_bench/src/raft_ivf_flat_wrapper.h rename to cpp/bench/ann/src/raft_ivf_flat_wrapper.h diff --git a/cpp/cuann_bench/src/raft_ivf_pq.cu b/cpp/bench/ann/src/raft_ivf_pq.cu similarity index 100% rename from cpp/cuann_bench/src/raft_ivf_pq.cu rename to cpp/bench/ann/src/raft_ivf_pq.cu diff --git a/cpp/cuann_bench/src/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft_ivf_pq_wrapper.h similarity index 100% rename from cpp/cuann_bench/src/raft_ivf_pq_wrapper.h rename to cpp/bench/ann/src/raft_ivf_pq_wrapper.h diff --git a/cpp/cuann_bench/src/raft_wrapper.h b/cpp/bench/ann/src/raft_wrapper.h similarity index 100% rename from cpp/cuann_bench/src/raft_wrapper.h rename to cpp/bench/ann/src/raft_wrapper.h diff --git a/cpp/cuann_bench/src/util.cpp b/cpp/bench/ann/src/util.cpp similarity index 100% rename from cpp/cuann_bench/src/util.cpp rename to cpp/bench/ann/src/util.cpp diff --git a/cpp/cuann_bench/src/util.h b/cpp/bench/ann/src/util.h similarity index 100% 
rename from cpp/cuann_bench/src/util.h rename to cpp/bench/ann/src/util.h diff --git a/cpp/cuann_bench/third_party/patches/ggnn.patch b/cpp/bench/ann/third_party/patches/ggnn.patch similarity index 100% rename from cpp/cuann_bench/third_party/patches/ggnn.patch rename to cpp/bench/ann/third_party/patches/ggnn.patch diff --git a/cpp/cuann_bench/third_party/patches/json.patch b/cpp/bench/ann/third_party/patches/json.patch similarity index 100% rename from cpp/cuann_bench/third_party/patches/json.patch rename to cpp/bench/ann/third_party/patches/json.patch diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt new file mode 100644 index 0000000000..d90c2d9d3d --- /dev/null +++ b/cpp/bench/prims/CMakeLists.txt @@ -0,0 +1,142 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +function(ConfigurePrimsBench) + + set(options OPTIONAL DIST NN) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) + + cmake_parse_arguments( + ConfigurePrimsBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ) + + set(BENCH_NAME ${ConfigurePrimsBench_NAME}_PRIMS_BENCH) + + add_executable(${BENCH_NAME} ${ConfigurePrimsBench_PATH}) + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + raft_internal + $<$:raft::distance> + $<$:raft::nn> + benchmark::benchmark + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + target_include_directories( + ${BENCH_NAME} PUBLIC "$" + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/gbench/prims/libraft + EXCLUDE_FROM_ALL + ) + +endfunction() + +if(BUILD_PRIMS_BENCH) + ConfigurePrimsBench( + NAME CLUSTER PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu + bench/prims/main.cpp OPTIONAL DIST NN + ) + + ConfigurePrimsBench( + NAME + DISTANCE + PATH + bench/prims/distance/distance_cosine.cu + bench/prims/distance/distance_exp_l2.cu + bench/prims/distance/distance_l1.cu + bench/prims/distance/distance_unexp_l2.cu + bench/prims/distance/fused_l2_nn.cu + bench/prims/distance/masked_nn.cu + bench/prims/distance/kernels.cu + bench/prims/main.cpp + OPTIONAL + DIST + ) + + 
ConfigurePrimsBench( + NAME + LINALG + PATH + bench/prims/linalg/add.cu + bench/prims/linalg/map_then_reduce.cu + bench/prims/linalg/matrix_vector_op.cu + bench/prims/linalg/norm.cu + bench/prims/linalg/normalize.cu + bench/prims/linalg/reduce_cols_by_key.cu + bench/prims/linalg/reduce_rows_by_key.cu + bench/prims/linalg/reduce.cu + bench/prims/main.cpp + ) + + ConfigurePrimsBench( + NAME MATRIX PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu bench/prims/main.cpp + ) + + ConfigurePrimsBench( + NAME RANDOM PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu + bench/prims/random/rng.cu bench/prims/main.cpp + ) + + ConfigurePrimsBench(NAME SPARSE PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) + + ConfigurePrimsBench( + NAME + NEIGHBORS + PATH + bench/prims/neighbors/knn/brute_force_float_int64_t.cu + bench/prims/neighbors/knn/brute_force_float_uint32_t.cu + bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_float_uint64_t.cu + bench/prims/neighbors/knn/ivf_pq_int8_t_uint64_t.cu + bench/prims/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu + bench/prims/neighbors/refine_float_uint64_t.cu + bench/prims/neighbors/refine_uint8_t_uint64_t.cu + bench/prims/main.cpp + OPTIONAL + DIST + NN + ) +endif() diff --git a/cpp/bench/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu similarity index 99% rename from cpp/bench/cluster/kmeans.cu rename to cpp/bench/prims/cluster/kmeans.cu index 76b16cfe56..51347dacc5 100644 --- a/cpp/bench/cluster/kmeans.cu +++ b/cpp/bench/prims/cluster/kmeans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu similarity index 98% rename from cpp/bench/cluster/kmeans_balanced.cu rename to cpp/bench/prims/cluster/kmeans_balanced.cu index 9c53e86d8c..705021ddcd 100644 --- a/cpp/bench/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp similarity index 100% rename from cpp/bench/common/benchmark.hpp rename to cpp/bench/prims/common/benchmark.hpp diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh similarity index 100% rename from cpp/bench/distance/distance_common.cuh rename to cpp/bench/prims/distance/distance_common.cuh diff --git a/cpp/bench/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu similarity index 94% rename from cpp/bench/distance/distance_cosine.cu rename to cpp/bench/prims/distance/distance_cosine.cu index 20f29ce4ef..c8ac8067c8 100644 --- a/cpp/bench/distance/distance_cosine.cu +++ b/cpp/bench/prims/distance/distance_cosine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_exp_l2.cu rename to cpp/bench/prims/distance/distance_exp_l2.cu index 5a3af17193..52b7fff05c 100644 --- a/cpp/bench/distance/distance_exp_l2.cu +++ b/cpp/bench/prims/distance/distance_exp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu similarity index 93% rename from cpp/bench/distance/distance_l1.cu rename to cpp/bench/prims/distance/distance_l1.cu index 2ad7d5e957..e80db48ef0 100644 --- a/cpp/bench/distance/distance_l1.cu +++ b/cpp/bench/prims/distance/distance_l1.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_unexp_l2.cu rename to cpp/bench/prims/distance/distance_unexp_l2.cu index 406aca2378..7ac1a8a4b5 100644 --- a/cpp/bench/distance/distance_unexp_l2.cu +++ b/cpp/bench/prims/distance/distance_unexp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu similarity index 99% rename from cpp/bench/distance/fused_l2_nn.cu rename to cpp/bench/prims/distance/fused_l2_nn.cu index 48473b2846..2c46bc3c78 100644 --- a/cpp/bench/distance/fused_l2_nn.cu +++ b/cpp/bench/prims/distance/fused_l2_nn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu similarity index 100% rename from cpp/bench/distance/kernels.cu rename to cpp/bench/prims/distance/kernels.cu diff --git a/cpp/bench/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu similarity index 100% rename from cpp/bench/distance/masked_nn.cu rename to cpp/bench/prims/distance/masked_nn.cu diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/prims/linalg/add.cu similarity index 96% rename from cpp/bench/linalg/add.cu rename to cpp/bench/prims/linalg/add.cu index 7d00b8cbae..456214ad7b 100644 --- a/cpp/bench/linalg/add.cu +++ b/cpp/bench/prims/linalg/add.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/prims/linalg/map_then_reduce.cu similarity index 97% rename from cpp/bench/linalg/map_then_reduce.cu rename to cpp/bench/prims/linalg/map_then_reduce.cu index 33a3e66264..84aebd85bf 100644 --- a/cpp/bench/linalg/map_then_reduce.cu +++ b/cpp/bench/prims/linalg/map_then_reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/prims/linalg/matrix_vector_op.cu similarity index 99% rename from cpp/bench/linalg/matrix_vector_op.cu rename to cpp/bench/prims/linalg/matrix_vector_op.cu index aa388955da..d1fbaee79b 100644 --- a/cpp/bench/linalg/matrix_vector_op.cu +++ b/cpp/bench/prims/linalg/matrix_vector_op.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu similarity index 98% rename from cpp/bench/linalg/norm.cu rename to cpp/bench/prims/linalg/norm.cu index efecee88c9..f83953f8e4 100644 --- a/cpp/bench/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu similarity index 98% rename from cpp/bench/linalg/normalize.cu rename to cpp/bench/prims/linalg/normalize.cu index d01473ffeb..ad9052a008 100644 --- a/cpp/bench/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/prims/linalg/reduce.cu similarity index 97% rename from cpp/bench/linalg/reduce.cu rename to cpp/bench/prims/linalg/reduce.cu index 015e0b8abe..cf41c5916a 100644 --- a/cpp/bench/linalg/reduce.cu +++ b/cpp/bench/prims/linalg/reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_cols_by_key.cu rename to cpp/bench/prims/linalg/reduce_cols_by_key.cu index 43aeb69ab0..ac0c612ee4 100644 --- a/cpp/bench/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_rows_by_key.cu rename to cpp/bench/prims/linalg/reduce_rows_by_key.cu index 075bc7c8c4..aa9c9a1f62 100644 --- a/cpp/bench/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/main.cpp b/cpp/bench/prims/main.cpp similarity index 92% rename from cpp/bench/main.cpp rename to cpp/bench/prims/main.cpp index 3162422e8e..40f539facf 100644 --- a/cpp/bench/main.cpp +++ b/cpp/bench/prims/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu similarity index 100% rename from cpp/bench/matrix/argmin.cu rename to cpp/bench/prims/matrix/argmin.cu diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu similarity index 100% rename from cpp/bench/matrix/gather.cu rename to cpp/bench/prims/matrix/gather.cu diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu similarity index 100% rename from cpp/bench/matrix/select_k.cu rename to cpp/bench/prims/matrix/select_k.cu diff --git a/cpp/bench/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh similarity index 100% rename from cpp/bench/neighbors/knn.cuh rename to cpp/bench/prims/neighbors/knn.cuh diff --git a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu index d981104e20..7df0599670 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu index 60f7edae96..9704d39e76 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu index 594d4d16d2..fbbb4f9acc 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu index bd268f036c..7067dbe1b6 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_float_uint64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_float_uint64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_float_uint64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_float_uint64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_int8_t_uint64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_uint64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_int8_t_uint64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_uint64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu diff --git a/cpp/bench/neighbors/refine.cuh b/cpp/bench/prims/neighbors/refine.cuh similarity index 100% rename from cpp/bench/neighbors/refine.cuh rename to cpp/bench/prims/neighbors/refine.cuh diff --git a/cpp/bench/neighbors/refine_float_uint64_t.cu b/cpp/bench/prims/neighbors/refine_float_uint64_t.cu similarity index 100% rename from cpp/bench/neighbors/refine_float_uint64_t.cu rename to cpp/bench/prims/neighbors/refine_float_uint64_t.cu diff --git a/cpp/bench/neighbors/refine_uint8_t_uint64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_uint64_t.cu similarity index 100% rename from cpp/bench/neighbors/refine_uint8_t_uint64_t.cu rename to cpp/bench/prims/neighbors/refine_uint8_t_uint64_t.cu diff --git a/cpp/bench/random/make_blobs.cu 
b/cpp/bench/prims/random/make_blobs.cu similarity index 98% rename from cpp/bench/random/make_blobs.cu rename to cpp/bench/prims/random/make_blobs.cu index 950d80c499..f43d914cf2 100644 --- a/cpp/bench/random/make_blobs.cu +++ b/cpp/bench/prims/random/make_blobs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/random/permute.cu b/cpp/bench/prims/random/permute.cu similarity index 100% rename from cpp/bench/random/permute.cu rename to cpp/bench/prims/random/permute.cu diff --git a/cpp/bench/random/rng.cu b/cpp/bench/prims/random/rng.cu similarity index 98% rename from cpp/bench/random/rng.cu rename to cpp/bench/prims/random/rng.cu index 147adf26ae..d15c9441d7 100644 --- a/cpp/bench/random/rng.cu +++ b/cpp/bench/prims/random/rng.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu similarity index 100% rename from cpp/bench/sparse/convert_csr.cu rename to cpp/bench/prims/sparse/convert_csr.cu From fc1c5e6b51c9b621b0ce9a756c5117ff9474d4d6 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 25 Feb 2023 11:31:31 -0500 Subject: [PATCH 05/39] Cleaning up hnswlib cmake --- cpp/bench/ann/CMakeLists.txt | 2 +- cpp/cmake/thirdparty/get_hnswlib.cmake | 18 ++---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 2a0fce6458..ce0c01e300 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -118,7 +118,7 @@ endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureCuannBench( NAME HNSWLIB PATH bench/ann/src/benchmark.cpp INCLUDES - ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib/hnswlib CXXFLAGS -mavx + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS -mavx ) endif() diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 69ea99a006..72c17a84c0 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -21,26 +21,12 @@ function(find_and_configure_hnswlib) set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) - if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib ) + if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) execute_process ( - COMMAND mkdir hnswlib + COMMAND git clone --branch=v0.6.2 https://github.com/nmslib/hnswlib.git hnswlib-src WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) - execute_process ( - COMMAND wget https://github.com/nmslib/hnswlib/archive/refs/tags/v0.6.2.zip -O hnswlib-0.6.2.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) - - execute_process ( - COMMAND unzip hnswlib-0.6.2.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) - execute_process ( - COMMAND mv -f hnswlib-0.6.2/hnswlib/ . 
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) - execute_process ( - COMMAND rm -r hnswlib-0.6.2 hnswlib-0.6.2.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib ) - endif () endfunction() From f34d6e8137c0fb876fb3a03fdf173760c03e21f6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 25 Feb 2023 13:10:56 -0500 Subject: [PATCH 06/39] Moving README.md from source code to docs --- .../source/cuda_ann_benchmarks.md | 130 +++++++----------- docs/source/index.rst | 5 +- 2 files changed, 53 insertions(+), 82 deletions(-) rename cpp/bench/ann/README.md => docs/source/cuda_ann_benchmarks.md (72%) diff --git a/cpp/bench/ann/README.md b/docs/source/cuda_ann_benchmarks.md similarity index 72% rename from cpp/bench/ann/README.md rename to docs/source/cuda_ann_benchmarks.md index 5ed1c187b1..c36d9a11d7 100644 --- a/cpp/bench/ann/README.md +++ b/docs/source/cuda_ann_benchmarks.md @@ -1,59 +1,24 @@ -# cuANN - CUDA Approximate Nearest Neighbor (ANN) Search +# CUDA ANN Benchmarks -This project provides a benchmark program for various ANN search implementations. It's especially suitable for GPU implementations. - -## Developer Guide - -Please read [CONTRIBUTING.md](CONTRIBUTING.md) before writing code for this project. +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations. ## Benchmark -### Building -Prerequisites for compiling the `benchmark` program: -* CUDA >= 11.3 -* GCC >= 8.2 if RAFT is used (preferably GCC 9.5+) -* NCCL >= 2.10 if multi-GPU support is enabled -* FAISS (https://github.com/facebookresearch/faiss) if it's enabled -* cmake in the search path of executable files, for automatically installing FAISS and glog (Google Logging Library, required by GGNN) -* miscellaneous libraries that can be installed using Makefile - -#### installing misc. 
libraries -Most of the libraries are optional, and they can be enabled by the CUANN_USE_XYZ flags in [benchmark/Makefile](benchmark/Makefile). They will be downloadad automatically. - - -#### installing NCCL -If `CUANN_USE_MULTI_GPU = 1` in `benchmark/Makefile`, NCCL is required. - -It's most convenient to install NCCL under `${cuann_bench_path}/third_party/nccl/`, like using "O/S agnostic local installer" downloaded from https://developer.nvidia.com/nccl/nccl-download. Otherwise, may need to modify `CPPFLAGS` and `LDFLAGS` in `benchmark/Makefile` to add include and library paths. - - -#### installing FAISS library -FAISS can be installed in many ways: -* If `CUANN_USE_RAPIDS_CONTAINER = 1` in `benchmark/Makefile`, RAPIDS Docker container already has FAISS installed -* If `CUANN_USE_RAPIDS_CONTAINER = 0` in `benchmark/Makefile`, FAISS will be installed automatically with the Makefile. However, it reqiures a BLAS implementation is available by setting either environmental paths or Makefile flags. -* Sometimes, FAISS has already been installed in the system and we want to use that. Beside modifying `FAISS_PATH` in `benchmark/Makefile`, we also need to prevent Makefile from installing FAISS again. For that, change the line `faiss: faiss/lib/libfaiss.so` in `third_parth/Makefile` to `faiss:`. +### Dependencies -For manual installation: need to install FAISS from source. See [Building from source](https://github.com/facebookresearch/faiss/blob/master/INSTALL.md#building-from-source) for detailed steps. +TODO: Need to fill in a conda environment file and direct users to it -It's most convenient to install FAISS under `${cuann_bench_path}/third_party/faiss/`. Otherwise, may need to modify `FAISS_PATH` in `benchmark/Makefile`. +### compiling benchmark -An example of cmake build commands: +The easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. 
The following will build the executables for all the support algorithms: +```bash +./build.sh bench-ann ``` -mkdir build && cd build -cmake -DFAISS_ENABLE_GPU=ON \ - -DFAISS_ENABLE_PYTHON=OFF -DBUILD_TESTING=OFF \ - -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES="70;75;80;86" \ - -DCMAKE_INSTALL_PREFIX=${cuann_bench_path}/third_party/faiss .. -``` - - - -#### compiling benchmark -First, modify CUANN_USE_XXX flags at the top of benchmark/Makefile to enable desirable implementations. By default, none is enabled. - -Then, just run `cd benchmark && make -j`. +You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): +```bash +./build.sh bench-ann --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH +``` By default, the `benchmark` program accepts dataset of `float` type. To use other type, change the line `using data_t = float;` in `benchmark/src/benchmark.cu` to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. @@ -66,34 +31,38 @@ There are 4 steps to run the benchmark: 4. 
evaluate result #### TL;DR -A complete example: +A complete example (run from the RAFT source code root directory): ``` # (1) prepare a dataset pip3 install numpy h5py # if they have not been installed already -cd benchmark +pushd +cd bench/ann mkdir data && cd data wget http://ann-benchmarks.com/glove-100-angular.hdf5 # option -n is used here to normalize vectors so cosine distance is converted # to inner product; don't use -n for l2 distance -../../script/hdf5_to_fbin.py -n glove-100-angular.hdf5 +python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 mkdir glove-100-inner mv glove-100-angular.base.fbin glove-100-inner/base.fbin mv glove-100-angular.query.fbin glove-100-inner/query.fbin mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin -cd .. +popd # (2) build index -./benchmark -b -i faiss_ivf_flat.nlist1024 conf/glove-100-inner.json +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json # (3) search -./benchmark -s -i faiss_ivf_flat.nlist1024 conf/glove-100-inner.json +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json # (4) evaluate result -../script/eval.pl \ +pushd +cd bench/ann +./scripts/eval.pl \ -o result.csv \ data/glove-100-inner/groundtruth.neighbors.ibin \ result/glove-100-inner/faiss_ivf_flat +popd # optional step: plot QPS-Recall figure using data in result.csv with your favorite tool ``` @@ -105,19 +74,19 @@ A dataset usually has 4 binary files containing database vectors, query vectors, The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. 
These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. -Some implementation, like Cagra, can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. +Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. Commonly used datasets can be downloaded from two websites: 1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). - However, these datasets are in HDF5 format. Use `script/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + However, these datasets are in HDF5 format. Use `bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: ``` pip3 install numpy h5py ``` The usage of this script is: ``` - $ script/hdf5_to_fbin.py - usage: script/hdf5_to_fbin.py [-n] .hdf5 + $ bench/ann/scripts/hdf5_to_fbin.py + usage: scripts/hdf5_to_fbin.py [-n] .hdf5 -n: normalize base/query set outputs: .base.fbin .query.fbin @@ -131,27 +100,28 @@ Commonly used datasets can be downloaded from two websites: 2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. 
A script is provided for this: ``` - $ script/split_groundtruth.pl + $ bench/ann/scripts/split_groundtruth.pl usage: script/split_groundtruth.pl input output_prefix ``` Take Deep-1B dataset as an example: ``` - cd benchmark + pushd + cd bench/ann mkdir -p data/deep-1B && cd data/deep-1B # download manually "Ground Truth" file of "Yandex DEEP" # suppose the file name is deep_new_groundtruth.public.10K.bin - ../../../script/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + popd ``` Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. #### step 2: building index -An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `benchmark -b` to build an index and save it to disk. - +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. -To run `benchmark`, a JSON configuration file is required. Refer to [`benchmark/conf/glove-100-inner.json`](conf/glove-100-inner.json) as an example. Configuration file has 3 sections: -* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since `benchmark` program is for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. 
+To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`benchmark/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: +* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. * `search_basic_param` section specifies basic parameters for searching: - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. @@ -165,10 +135,10 @@ To run `benchmark`, a JSON configuration file is required. Refer to [`benchmark/ -The usage of `benchmark` can be found by running `benchmark -h`: +The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: ``` -$ ./benchmark -h -usage: ./benchmark -b|s [-f] [-i index_names] conf.json +$ ./*_ANN_BENCH -h +usage: ./*_ANN_BENCH -b|s [-f] [-i index_names] conf.json -b: build mode, will build index -s: search mode, will search using built index one and only one of -b and -s should be specified @@ -204,31 +174,31 @@ It's easier to describe the usage of `-i` option with an example. 
Suppose we hav Then, ``` # build all indices: hnsw1, hnsw2 and faiss -./benchmark -b a.json +./HNSWLIB_ANN_BENCH -b a.json # build only hnsw1 -./benchmark -b -i hnsw1 a.json +./HNSWLIB_ANN_BENCH -b -i hnsw1 a.json # build hnsw1 and hnsw2 -./benchmark -b -i hnsw1,hnsw2 a.json +./HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json # build hnsw1 and hnsw2 -./benchmark -b -i 'hnsw*' a.json +./HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json -# build hnsw1, hnsw2 and faiss -./benchmark -b -i 'hnsw*,faiss' a.json +# build faiss +./FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json ``` In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. #### step 3: searching -Use `benchmark -s`. Other options are the same as in step 2. +Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. #### step 4: evaluating results -Use `script/eval.pl` to evaluate benchmark results. The usage is: +Use `bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: ``` -$ script/eval.pl +$ bench/ann/scripts/eval.pl usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... result_paths... are paths to the search result files. Can specify multiple paths. @@ -241,7 +211,7 @@ usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
An example: ``` -script/eval.pl groundtruth.neighbors.ibin \ +bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ result/glove-100-angular/10/faiss/ ``` @@ -254,7 +224,7 @@ It saves recall value in result txt file, so avoids to recompute recall if the s ## How to add a new ANN algorithm -Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `src/cuann_bench/ann.h`) and implements all the pure virtual functions. +Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `bench/ann/src/ann.h`) and implements all the pure virtual functions. In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: ``` @@ -293,7 +263,7 @@ The benchmark program uses JSON configuration file. To add the new algorithm to }, ``` -How to interpret these JSON objects is totally left to the implementation and should be specified in `benchmark/src/factory.cuh`: +How to interpret these JSON objects is totally left to the implementation and should be specified in `bench/ann/src/factory.cuh`: * First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: ``` template diff --git a/docs/source/index.rst b/docs/source/index.rst index 9890bd932f..fbe01fe889 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ -Welcome to RAFT's documentation! -================================= +RAPIDS RAFT +=========== RAFT contains fundamental widely-used algorithms and primitives for scientific computing, data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. 
@@ -44,6 +44,7 @@ While not exhaustive, the following general categories help summarize the accele developer_guide.md cpp_api.rst pylibraft_api.rst + cuda_ann_benchmarks.md raft_dask_api.rst using_comms.rst contributing.md From 30cc9f1316657c55e066134d546ed0d39d3dceae Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 25 Feb 2023 13:17:20 -0500 Subject: [PATCH 07/39] More docs upates --- docs/source/cuda_ann_benchmarks.md | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md index c36d9a11d7..ac96c0d0cc 100644 --- a/docs/source/cuda_ann_benchmarks.md +++ b/docs/source/cuda_ann_benchmarks.md @@ -20,7 +20,17 @@ You can limit the algorithms that are built by providing a semicolon-delimited l ./build.sh bench-ann --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH ``` -By default, the `benchmark` program accepts dataset of `float` type. To use other type, change the line `using data_t = float;` in `benchmark/src/benchmark.cu` to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. +Available targets to use with `--limit-bench-ann` are: +- FAISS_IVF_FLAT_ANN_BENCH +- FAISS_IVF_PQ_ANN_BENCH +- FAISS_BFKNN_ANN_BENCH +- GGNN_ANN_BENCH +- HNSWLIB_ANN_BENCH +- RAFT_IVF_PQ_ANN_BENCH +- RAFT_IVF_FLAT_ANN_BENCH +- RAFT_BFKNN_ANN_BENCH + +By default, the `*_ANN_BENCH` executables program accept dataset of `float` type. To use other type, change the line `using data_t = float;` in `bench/ann/src/benchmark.cu` (or `bench/ann/src/benchmark/cpp` if benchmarking a non-CUDA algorithm) to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. ### Usage @@ -120,7 +130,7 @@ Commonly used datasets can be downloaded from two websites: #### step 2: building index An index is a data structure to facilitate searching. 
Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. -To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`benchmark/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`bench/ann/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: * `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. * `search_basic_param` section specifies basic parameters for searching: @@ -137,8 +147,8 @@ To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configurat The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: ``` -$ ./*_ANN_BENCH -h -usage: ./*_ANN_BENCH -b|s [-f] [-i index_names] conf.json +$ ./cpp/build/*_ANN_BENCH -h +usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json -b: build mode, will build index -s: search mode, will search using built index one and only one of -b and -s should be specified @@ -151,8 +161,8 @@ usage: ./*_ANN_BENCH -b|s [-f] [-i index_names] conf.json ``` * `-b`: build index. * `-s`: do the searching with built index. -* `-f`: before doing the real task, `benchmark` checks that needed input files exist and output files don't exist. 
If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. -* `-i`: by default, `benchmark -b` will build all indices found in the configuration file, and `benchmark -s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. +* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. +* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: ``` @@ -174,19 +184,19 @@ It's easier to describe the usage of `-i` option with an example. Suppose we hav Then, ``` # build all indices: hnsw1, hnsw2 and faiss -./HNSWLIB_ANN_BENCH -b a.json +./cpp/build/HNSWLIB_ANN_BENCH -b a.json # build only hnsw1 -./HNSWLIB_ANN_BENCH -b -i hnsw1 a.json +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json # build hnsw1 and hnsw2 -./HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json # build hnsw1 and hnsw2 -./HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json +./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json # build faiss -./FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json +./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json ``` In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. 
Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. From e93ede734822e33e404462be2ce147e964c98bfe Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 28 Feb 2023 13:47:58 -0500 Subject: [PATCH 08/39] Breaking benchmarks apart to make them easier to maintain --- cpp/bench/ann/CMakeLists.txt | 50 +- cpp/bench/ann/src/ann.h | 4 +- cpp/bench/ann/src/benchmark.cu | 555 ------------------ .../ann/src/{benchmark.cpp => benchmark.hpp} | 70 ++- cpp/bench/ann/src/benchmark_util.hpp | 42 ++ cpp/bench/ann/src/cudart_util.h | 16 +- cpp/bench/ann/src/factory.cuh | 345 ----------- cpp/bench/ann/src/faiss_benchmark.cu | 19 + cpp/bench/ann/src/faiss_benchmark.cuh | 148 +++++ cpp/bench/ann/src/faiss_wrapper.h | 10 +- cpp/bench/ann/src/ggnn_benchmark.cu | 19 + cpp/bench/ann/src/ggnn_benchmark.cuh | 123 ++++ cpp/bench/ann/src/ggnn_wrapper.cuh | 4 +- cpp/bench/ann/src/hnswlib_benchmark.cpp | 19 + .../src/{factory.h => hnswlib_benchmark.hpp} | 59 +- cpp/bench/ann/src/hnswlib_wrapper.h | 4 +- cpp/bench/ann/src/multigpu.cuh | 4 +- ...t_cuann_utils.h => raft_ann_bench_utils.h} | 10 +- cpp/bench/ann/src/raft_benchmark.cu | 19 + cpp/bench/ann/src/raft_benchmark.cuh | 212 +++++++ cpp/bench/ann/src/raft_ivf_flat.cu | 4 +- cpp/bench/ann/src/raft_ivf_flat_wrapper.h | 6 +- cpp/bench/ann/src/raft_ivf_pq.cu | 4 +- cpp/bench/ann/src/raft_ivf_pq_wrapper.h | 6 +- cpp/bench/ann/src/raft_wrapper.h | 10 +- docs/source/cuda_ann_benchmarks.md | 26 +- 26 files changed, 741 insertions(+), 1047 deletions(-) delete mode 100644 cpp/bench/ann/src/benchmark.cu rename cpp/bench/ann/src/{benchmark.cpp => benchmark.hpp} (90%) create mode 100644 cpp/bench/ann/src/benchmark_util.hpp delete mode 100644 cpp/bench/ann/src/factory.cuh create mode 100644 cpp/bench/ann/src/faiss_benchmark.cu create mode 100644 cpp/bench/ann/src/faiss_benchmark.cuh create mode 100644 cpp/bench/ann/src/ggnn_benchmark.cu create mode 100644 cpp/bench/ann/src/ggnn_benchmark.cuh create mode 
100644 cpp/bench/ann/src/hnswlib_benchmark.cpp rename cpp/bench/ann/src/{factory.h => hnswlib_benchmark.hpp} (56%) rename cpp/bench/ann/src/{raft_cuann_utils.h => raft_ann_bench_utils.h} (83%) create mode 100644 cpp/bench/ann/src/raft_benchmark.cu create mode 100644 cpp/bench/ann/src/raft_benchmark.cuh diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index ce0c01e300..dca63538b7 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -32,6 +32,14 @@ if(RAFT_ANN_BENCH_USE_FAISS_BFKNN set(RAFT_ANN_BENCH_USE_FAISS ON) endif() +set(RAFT_ANN_BENCH_USE_RAFT OFF) +if(RAFT_ANN_BENCH_USE_RAFT_BFKNN + OR RAFT_ANN_BENCH_USE_RAFT_IVFPQ + OR RAFT_ANN_BENCH_USE_RAFT_IVFFLAT +) + set(RAFT_ANN_BENCH_USE_RAFT ON) +endif() + if(RAFT_ANN_BENCH_USE_HNSWLIB) message("Using hnswlib") include(cmake/thirdparty/get_hnswlib.cmake) @@ -117,46 +125,32 @@ endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureCuannBench( - NAME HNSWLIB PATH bench/ann/src/benchmark.cpp INCLUDES + NAME HNSWLIB PATH bench/ann/src/hnswlib_benchmark.cpp INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS -mavx ) endif() -if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) - ConfigureCuannBench( - NAME RAFT_IVF_PQ PATH bench/ann/src/benchmark.cu bench/ann/src/raft_ivf_pq.cu LINKS - raft::distance raft::nn - ) -endif() - -if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) - ConfigureCuannBench( - NAME RAFT_IVF_FLAT PATH bench/ann/src/benchmark.cu bench/ann/src/raft_ivf_flat.cu LINKS - raft::distance raft::nn - ) -endif() - -if(RAFT_ANN_BENCH_USE_RAFT_BFKNN) +if(RAFT_ANN_BENCH_USE_RAFT) ConfigureCuannBench( - NAME RAFT_IVF_FLAT PATH bench/ann/src/benchmark.cu LINKS raft::distance raft::nn + NAME + RAFT_IVF_PQ + PATH + bench/ann/src/raft_benchmark.cu + $<$:bench/ann/src/raft_ivf_pq.cu> + $<$:bench/ann/src/raft_ivf_flat.cu> + LINKS + raft::distance + raft::nn ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) - ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH bench/ann/src/benchmark.cu 
LINKS faiss::faiss) -endif() - -if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) - ConfigureCuannBench(NAME FAISS_IVF_PQ PATH bench/ann/src/benchmark.cu LINKS faiss::faiss) -endif() - -if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) - ConfigureCuannBench(NAME FAISS_BFKNN PATH bench/ann/src/benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS) + ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss_benchmark.cu LINKS faiss::faiss) endif() if(RAFT_ANN_BENCH_USE_GGNN) ConfigureCuannBench( - NAME GGNN PATH bench/ann/src/benchmark.cu INCLUDES + NAME GGNN PATH bench/ann/src/ggnn_benchmark.cu INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include ) endif() diff --git a/cpp/bench/ann/src/ann.h b/cpp/bench/ann/src/ann.h index fae1fe3977..105688228d 100644 --- a/cpp/bench/ann/src/ann.h +++ b/cpp/bench/ann/src/ann.h @@ -21,7 +21,7 @@ #include -namespace cuann { +namespace raft::bench::ann { enum class Metric { kInnerProduct, @@ -84,6 +84,6 @@ class ANN { int dim_; }; -} // namespace cuann +} // namespace raft::bench::ann #endif // ANN_H_ diff --git a/cpp/bench/ann/src/benchmark.cu b/cpp/bench/ann/src/benchmark.cu deleted file mode 100644 index f71650f383..0000000000 --- a/cpp/bench/ann/src/benchmark.cu +++ /dev/null @@ -1,555 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifdef NVTX -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "conf.h" -#include "dataset.h" -#include "factory.cuh" -#include "util.h" - -using std::cerr; -using std::cout; -using std::endl; -using std::string; -using std::to_string; -using std::unordered_set; -using std::vector; -using namespace benchmark; -using cuann::MemoryType; - -// supported types: float, half (very few implementations support it), uint8_t, int8_t -using data_t = float; - -bool check_file_exist(const vector& files) -{ - bool ret = true; - unordered_set processed; - for (const auto& file : files) { - if (processed.find(file) == processed.end() && !file_exists(file)) { - log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); - ret = false; - } - processed.insert(file); - } - return ret; -} - -bool check_file_not_exist(const vector& files, bool force_overwrite) -{ - bool ret = true; - for (const auto& file : files) { - if (file_exists(file)) { - if (force_overwrite) { - log_warn("'%s' already exists, will overwrite it", file.c_str()); - } else { - log_error("'%s' already exists, use '-f' to force overwriting", file.c_str()); - ret = false; - } - } - } - return ret; -} - -bool check_no_duplicate_file(const vector& files) -{ - bool ret = true; - unordered_set processed; - for (const auto& file : files) { - if (processed.find(file) != processed.end()) { - log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); - ret = false; - } - processed.insert(file); - } - return ret; -} - -bool mkdir(const vector& dirs) -{ - unordered_set processed; - for (const auto& dir : dirs) { - if (processed.find(dir) == processed.end() && !dir_exists(dir)) { - if (create_dir(dir)) { - log_info("mkdir '%s'", dir.c_str()); - } else { - log_error("fail to create output directory '%s'", dir.c_str()); - // won't create any other dir when problem occurs - 
return false; - } - } - processed.insert(dir); - } - return true; -} - -bool check(const vector& indices, bool build_mode, bool force_overwrite) -{ - vector files_should_exist; - vector dirs_should_exist; - vector output_files; - for (const auto& index : indices) { - if (build_mode) { - output_files.push_back(index.file); - output_files.push_back(index.file + ".txt"); - - auto pos = index.file.rfind('/'); - if (pos != string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } - } else { - files_should_exist.push_back(index.file); - files_should_exist.push_back(index.file + ".txt"); - - output_files.push_back(index.search_result_file + ".0.ibin"); - output_files.push_back(index.search_result_file + ".0.txt"); - - auto pos = index.search_result_file.rfind('/'); - if (pos != string::npos) { - dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); - } - } - } - - bool ret = true; - if (!check_file_exist(files_should_exist)) { ret = false; } - if (!check_file_not_exist(output_files, force_overwrite)) { ret = false; } - if (!check_no_duplicate_file(output_files)) { ret = false; } - if (ret && !mkdir(dirs_should_exist)) { ret = false; } - return ret; -} - -void write_build_info(const string& file_prefix, - const string& dataset, - const string& distance, - const string& name, - const string& algo, - const string& build_param, - float build_time) -{ - std::ofstream ofs(file_prefix + ".txt"); - if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } - ofs << "dataset: " << dataset << "\n" - << "distance: " << distance << "\n" - << "\n" - << "name: " << name << "\n" - << "algo: " << algo << "\n" - << "build_param: " << build_param << "\n" - << "build_time: " << build_time << endl; - ofs.close(); - if (!ofs) { throw std::runtime_error("can't write to build info file: " + file_prefix + ".txt"); } -} - -template -void build(const benchmark::Dataset* dataset, const vector& indices) -{ - cudaStream_t stream; - 
ANN_CUDA_CHECK(cudaStreamCreate(&stream)); - - log_info( - "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); - - for (const auto& index : indices) { - log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); - auto algo = create_algo(index.algo, - dataset->distance(), - dataset->dim(), - index.refine_ratio, - index.build_param, - index.dev_list); - auto algo_property = algo->get_property(); - - const T* base_set_ptr = nullptr; - if (algo_property.dataset_memory_type == MemoryType::Host) { - log_info("%s", "loading base set to memory"); - base_set_ptr = dataset->base_set(); - } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { - log_info("%s", "mapping base set to memory"); - base_set_ptr = dataset->mapped_base_set(); - } else if (algo_property.dataset_memory_type == MemoryType::Device) { - log_info("%s", "loading base set to GPU"); - base_set_ptr = dataset->base_set_on_gpu(); - } - - log_info("building index '%s'", index.name.c_str()); - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); -#ifdef NVTX - nvtxRangePush("build"); -#endif - Timer timer; - algo->build(base_set_ptr, dataset->base_set_size(), stream); - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); - float elapsed_ms = timer.elapsed_ms(); -#ifdef NVTX - nvtxRangePop(); -#endif - log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); - ANN_CUDA_CHECK_LAST_ERROR(); - - algo->save(index.file); - write_build_info(index.file, - dataset->name(), - dataset->distance(), - index.name, - index.algo, - index.build_param.dump(), - elapsed_ms / 1000.0f); - log_info("saved index to %s", index.file.c_str()); - } - - ANN_CUDA_CHECK(cudaStreamDestroy(stream)); -} - -void write_search_result(const string& file_prefix, - const string& dataset, - const string& distance, - const string& name, - const string& algo, - const string& build_param, - const string& search_param, - int batch_size, - int run_count, - int k, - 
float search_time_average, - float search_time_p99, - float search_time_p999, - const int* neighbors, - size_t query_set_size) -{ - std::ofstream ofs(file_prefix + ".txt"); - if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } - ofs << "dataset: " << dataset << "\n" - << "distance: " << distance << "\n" - << "\n" - << "name: " << name << "\n" - << "algo: " << algo << "\n" - << "build_param: " << build_param << "\n" - << "search_param: " << search_param << "\n" - << "\n" - << "batch_size: " << batch_size << "\n" - << "run_count: " << run_count << "\n" - << "k: " << k << "\n" - << "average_search_time: " << search_time_average << endl; - if (search_time_p99 != std::numeric_limits::max()) { - ofs << "p99_search_time: " << search_time_p99 << endl; - } - if (search_time_p999 != std::numeric_limits::max()) { - ofs << "p999_search_time: " << search_time_p999 << endl; - } - ofs.close(); - if (!ofs) { - throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); - } - - BinFile neighbors_file(file_prefix + ".ibin", "w"); - neighbors_file.write(neighbors, query_set_size, k); -} - -template -void search(const benchmark::Dataset* dataset, const vector& indices) -{ - if (indices.empty()) { return; } - cudaStream_t stream; - ANN_CUDA_CHECK(cudaStreamCreate(&stream)); - - log_info("loading query set from dataset '%s', #vector = %zu", - dataset->name().c_str(), - dataset->query_set_size()); - const T* query_set = dataset->query_set(); - // query set is usually much smaller than base set, so load it eagerly - const T* d_query_set = dataset->query_set_on_gpu(); - size_t query_set_size = dataset->query_set_size(); - - // currently all indices has same batch_size, k and run_count - const int batch_size = indices[0].batch_size; - const int k = indices[0].k; - const int run_count = indices[0].run_count; - log_info( - "basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); - 
if (query_set_size % batch_size != 0) { - log_warn("query set size (%zu) % batch size (%d) != 0, the size of last batch is %zu", - query_set_size, - batch_size, - query_set_size % batch_size); - } - const size_t num_batches = (query_set_size - 1) / batch_size + 1; - size_t* neighbors = new size_t[query_set_size * k]; - int* neighbors_buf = new int[query_set_size * k]; - float* distances = new float[query_set_size * k]; - vector search_times; - search_times.reserve(num_batches); - size_t* d_neighbors; - float* d_distances; - ANN_CUDA_CHECK(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); - ANN_CUDA_CHECK(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); - - for (const auto& index : indices) { - log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); - auto algo = create_algo(index.algo, - dataset->distance(), - dataset->dim(), - index.refine_ratio, - index.build_param, - index.dev_list); - auto algo_property = algo->get_property(); - - log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); - algo->load(index.file); - - const T* this_query_set = query_set; - size_t* this_neighbors = neighbors; - float* this_distances = distances; - if (algo_property.query_memory_type == MemoryType::Device) { - this_query_set = d_query_set; - this_neighbors = d_neighbors; - this_distances = d_distances; - } - - if (algo_property.need_dataset_when_search) { - log_info("loading base set from dataset '%s', #vector = %zu", - dataset->name().c_str(), - dataset->base_set_size()); - const T* base_set_ptr = nullptr; - if (algo_property.dataset_memory_type == MemoryType::Host) { - log_info("%s", "loading base set to memory"); - base_set_ptr = dataset->base_set(); - } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { - log_info("%s", "mapping base set to memory"); - base_set_ptr = dataset->mapped_base_set(); - } else if 
(algo_property.dataset_memory_type == MemoryType::Device) { - log_info("%s", "loading base set to GPU"); - base_set_ptr = dataset->base_set_on_gpu(); - } - algo->set_search_dataset(base_set_ptr, dataset->base_set_size()); - } - - for (int i = 0, end_i = index.search_params.size(); i != end_i; ++i) { - auto p_param = create_search_param(index.algo, index.search_params[i]); - algo->set_search_param(*p_param); - log_info("search with param: %s", index.search_params[i].dump().c_str()); - - if (algo_property.query_memory_type == MemoryType::Device) { - ANN_CUDA_CHECK(cudaMemset(d_neighbors, 0, query_set_size * k * sizeof(*d_neighbors))); - ANN_CUDA_CHECK(cudaMemset(d_distances, 0, query_set_size * k * sizeof(*d_distances))); - } else { - memset(neighbors, 0, query_set_size * k * sizeof(*neighbors)); - memset(distances, 0, query_set_size * k * sizeof(*distances)); - } - - float best_search_time_average = std::numeric_limits::max(); - float best_search_time_p99 = std::numeric_limits::max(); - float best_search_time_p999 = std::numeric_limits::max(); - for (int run = 0; run < run_count; ++run) { - log_info("run %d / %d", run + 1, run_count); - for (size_t batch_id = 0; batch_id < num_batches; ++batch_id) { - size_t row = batch_id * batch_size; - int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); -#ifdef NVTX - string nvtx_label = "batch" + to_string(batch_id); - if (run_count != 1) { nvtx_label = "run" + to_string(run) + "-" + nvtx_label; } - if (batch_id == 10) { - run = run_count - 1; - break; - } -#endif - Timer timer; -#ifdef NVTX - nvtxRangePush(nvtx_label.c_str()); -#endif - algo->search(this_query_set + row * dataset->dim(), - actual_batch_size, - k, - this_neighbors + row * k, - this_distances + row * k, - stream); - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); - float elapsed_ms = timer.elapsed_ms(); -#ifdef NVTX - nvtxRangePop(); -#endif - // If the size of the last batch is less than batch_size, don't count it for - // search time. But neighbors of the last batch will still be filled, so it's - // counted for recall calculation. - if (actual_batch_size == batch_size) { - search_times.push_back(elapsed_ms / 1000.0f); // in seconds - } - } - - float search_time_average = - std::accumulate(search_times.cbegin(), search_times.cend(), 0.0f) / search_times.size(); - best_search_time_average = std::min(best_search_time_average, search_time_average); - - if (search_times.size() >= 100) { - std::sort(search_times.begin(), search_times.end()); - - auto calc_percentile_pos = [](float percentile, size_t N) { - return static_cast(std::ceil(percentile / 100.0 * N)) - 1; - }; - - float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; - best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); - - if (search_times.size() >= 1000) { - float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; - best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); - } - } - search_times.clear(); - } - ANN_CUDA_CHECK_LAST_ERROR(); - - if (algo_property.query_memory_type == MemoryType::Device) { - ANN_CUDA_CHECK(cudaMemcpy(neighbors, - d_neighbors, - query_set_size * k * sizeof(*d_neighbors), - 
cudaMemcpyDeviceToHost)); - ANN_CUDA_CHECK(cudaMemcpy(distances, - d_distances, - query_set_size * k * sizeof(*d_distances), - cudaMemcpyDeviceToHost)); - } - - for (size_t j = 0; j < query_set_size * k; ++j) { - neighbors_buf[j] = neighbors[j]; - } - write_search_result(index.search_result_file + "." + to_string(i), - dataset->name(), - dataset->distance(), - index.name, - index.algo, - index.build_param.dump(), - index.search_params[i].dump(), - batch_size, - index.run_count, - k, - best_search_time_average, - best_search_time_p99, - best_search_time_p999, - neighbors_buf, - query_set_size); - } - - log_info("finish searching for index '%s'", index.name.c_str()); - } - - delete[] neighbors; - delete[] neighbors_buf; - delete[] distances; - ANN_CUDA_CHECK(cudaFree(d_neighbors)); - ANN_CUDA_CHECK(cudaFree(d_distances)); - ANN_CUDA_CHECK(cudaStreamDestroy(stream)); -} - -const string usage(const string& argv0) -{ - return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + - " -b: build mode, will build index\n" + - " -s: search mode, will search using built index\n" + - " one and only one of -b and -s should be specified\n" + - " -c: just check command line options and conf.json are sensible\n" + - " won't build or search\n" + " -f: force overwriting existing output files\n" + - " -i: by default will build/search all the indices found in conf.json\n" + - " '-i' can be used to select a subset of indices\n" + - " 'index_names' is a list of comma-separated index names\n" + - " '*' is allowed as the last character of a name to select all matched indices\n" + - " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; -} - -int main(int argc, char** argv) -{ - bool force_overwrite = false; - bool build_mode = false; - bool search_mode = false; - bool only_check = false; - string index_patterns("*"); - - int opt; - while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { - switch (opt) { - case 'b': build_mode = true; break; - case 's': search_mode = 
true; break; - case 'c': only_check = true; break; - case 'f': force_overwrite = true; break; - case 'i': index_patterns = optarg; break; - case 'h': cout << usage(argv[0]) << endl; return -1; - default: cerr << "\n" << usage(argv[0]) << endl; return -1; - } - } - if (build_mode == search_mode) { - cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; - return -1; - } - if (argc - optind != 1) { - cerr << usage(argv[0]) << endl; - return -1; - } - string conf_file = argv[optind]; - - std::ifstream conf_stream(conf_file.c_str()); - if (!conf_stream) { - log_error("can't open configuration file: %s", argv[optind]); - return -1; - } - - try { - Configuration conf(conf_stream); - - auto dataset_conf = conf.get_dataset_conf(); - BinDataset dataset(dataset_conf.name, - dataset_conf.base_file, - dataset_conf.subset_first_row, - dataset_conf.subset_size, - dataset_conf.query_file, - dataset_conf.distance); - - vector indices = conf.get_indices(index_patterns); - if (!check(indices, build_mode, force_overwrite)) { return -1; } - - string message = "will "; - message += build_mode ? 
"build:" : "search:"; - for (const auto& index : indices) { - message += "\n " + index.name; - } - log_info("%s", message.c_str()); - - if (only_check) { - log_info("%s", "all check passed, quit due to option -c"); - return 0; - } - - if (build_mode) { - build(&dataset, indices); - } else if (search_mode) { - search(&dataset, indices); - } - } catch (const std::exception& e) { - log_error("exception occurs: %s", e.what()); - return -1; - } -} diff --git a/cpp/bench/ann/src/benchmark.cpp b/cpp/bench/ann/src/benchmark.hpp similarity index 90% rename from cpp/bench/ann/src/benchmark.cpp rename to cpp/bench/ann/src/benchmark.hpp index 674d107efd..c9dd3ce676 100644 --- a/cpp/bench/ann/src/benchmark.cpp +++ b/cpp/bench/ann/src/benchmark.hpp @@ -30,9 +30,9 @@ #include #include +#include "benchmark_util.hpp" #include "conf.h" #include "dataset.h" -#include "factory.h" #include "util.h" using std::cerr; @@ -43,12 +43,14 @@ using std::to_string; using std::unordered_set; using std::vector; using namespace benchmark; -using cuann::MemoryType; +using raft::bench::ann::MemoryType; + +namespace raft::bench::ann { // supported types: float, half (very few implementations support it), uint8_t, int8_t using data_t = float; -bool check_file_exist(const vector& files) +inline bool check_file_exist(const vector& files) { bool ret = true; unordered_set processed; @@ -62,7 +64,7 @@ bool check_file_exist(const vector& files) return ret; } -bool check_file_not_exist(const vector& files, bool force_overwrite) +inline bool check_file_not_exist(const vector& files, bool force_overwrite) { bool ret = true; for (const auto& file : files) { @@ -78,7 +80,7 @@ bool check_file_not_exist(const vector& files, bool force_overwrite) return ret; } -bool check_no_duplicate_file(const vector& files) +inline bool check_no_duplicate_file(const vector& files) { bool ret = true; unordered_set processed; @@ -92,7 +94,7 @@ bool check_no_duplicate_file(const vector& files) return ret; } -bool mkdir(const 
vector& dirs) +inline bool mkdir(const vector& dirs) { unordered_set processed; for (const auto& dir : dirs) { @@ -110,7 +112,9 @@ bool mkdir(const vector& dirs) return true; } -bool check(const vector& indices, bool build_mode, bool force_overwrite) +inline bool check(const vector& indices, + bool build_mode, + bool force_overwrite) { vector files_should_exist; vector dirs_should_exist; @@ -144,13 +148,13 @@ bool check(const vector& indices, bool build_mode, bool fo return ret; } -void write_build_info(const string& file_prefix, - const string& dataset, - const string& distance, - const string& name, - const string& algo, - const string& build_param, - float build_time) +inline void write_build_info(const string& file_prefix, + const string& dataset, + const string& distance, + const string& name, + const string& algo, + const string& build_param, + float build_time) { std::ofstream ofs(file_prefix + ".txt"); if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } @@ -225,21 +229,21 @@ void build(const benchmark::Dataset* dataset, const vector -void search(const benchmark::Dataset* dataset, const vector& indices) +inline void search(const benchmark::Dataset* dataset, + const vector& indices) { if (indices.empty()) { return; } cudaStream_t stream; @@ -466,7 +471,7 @@ void search(const benchmark::Dataset* dataset, const vector + +using std::cerr; +using std::cout; +using std::endl; +using std::string; +using std::to_string; +using std::unordered_set; +using std::vector; +using namespace benchmark; +using raft::bench::ann::MemoryType; + +namespace raft::bench::ann { + +inline raft::bench::ann::Metric parse_metric(const std::string& metric_str) +{ + if (metric_str == "inner_product") { + return raft::bench::ann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return raft::bench::ann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} +} // namespace 
raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/cudart_util.h b/cpp/bench/ann/src/cudart_util.h index 1d315ad532..2c590a9523 100644 --- a/cpp/bench/ann/src/cudart_util.h +++ b/cpp/bench/ann/src/cudart_util.h @@ -20,21 +20,21 @@ #include -#define ANN_CUDA_CHECK(call) \ - { \ - cuann::cuda_check_((call), __FILE__, __LINE__); \ +#define ANN_CUDA_CHECK(call) \ + { \ + raft::bench::ann::cuda_check_((call), __FILE__, __LINE__); \ } #ifndef NDEBUG -#define ANN_CUDA_CHECK_LAST_ERROR() \ - { \ - cuann::cuda_check_last_error_(__FILE__, __LINE__); \ +#define ANN_CUDA_CHECK_LAST_ERROR() \ + { \ + raft::bench::ann::cuda_check_last_error_(__FILE__, __LINE__); \ } #else #define ANN_CUDA_CHECK_LAST_ERROR() #endif -namespace cuann { +namespace raft::bench::ann { constexpr unsigned int WARP_FULL_MASK = 0xffffffff; constexpr int WARP_SIZE = 32; @@ -60,5 +60,5 @@ inline void cuda_check_last_error_(const char* file, int line) cuda_check_(err, file, line); } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/factory.cuh b/cpp/bench/ann/src/factory.cuh deleted file mode 100644 index 5a6e6fc694..0000000000 --- a/cpp/bench/ann/src/factory.cuh +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef FACTORY_H_ -#define FACTORY_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "ann.h" -#undef WARP_SIZE -#ifdef RAFT_ANN_BENCH_USE_FAISS -#include "faiss_wrapper.h" -#endif -#ifdef RAFT_ANN_BENCH_USE_GGNN -#include "ggnn_wrapper.cuh" -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN -#include "raft_wrapper.h" -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -#include "raft_ivf_flat_wrapper.h" -extern template class cuann::RaftIvfFlatGpu; -extern template class cuann::RaftIvfFlatGpu; -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ -#include "raft_ivf_pq_wrapper.h" -extern template class cuann::RaftIvfPQ; -extern template class cuann::RaftIvfPQ; -#endif -#ifdef RAFT_ANN_BENCH_USE_MULTI_GPU -#include "multigpu.cuh" -#endif -#define JSON_DIAGNOSTICS 1 -#include - -namespace benchmark { - -cuann::Metric parse_metric(const std::string& metric_str) -{ - if (metric_str == "inner_product") { - return cuann::Metric::kInnerProduct; - } else if (metric_str == "euclidean") { - return cuann::Metric::kEuclidean; - } else { - throw std::runtime_error("invalid metric: '" + metric_str + "'"); - } -} - -#ifdef RAFT_ANN_BENCH_USE_FAISS -template -void parse_build_param(const nlohmann::json& conf, - typename cuann::FaissGpuIVFFlat::BuildParam& param) -{ - param.nlist = conf.at("nlist"); -} - -template -void parse_build_param(const nlohmann::json& conf, - typename cuann::FaissGpuIVFPQ::BuildParam& param) -{ - param.nlist = conf.at("nlist"); - param.M = conf.at("M"); - if (conf.contains("usePrecomputed")) { - param.usePrecomputed = conf.at("usePrecomputed"); - } else { - param.usePrecomputed = false; - } - if (conf.contains("useFloat16")) { - param.useFloat16 = conf.at("useFloat16"); - } else { - param.useFloat16 = false; - } -} - -template -void parse_build_param(const nlohmann::json& conf, - typename cuann::FaissGpuIVFSQ::BuildParam& param) -{ - param.nlist = conf.at("nlist"); - param.quantizer_type = conf.at("quantizer_type"); -} - 
-template -void parse_search_param(const nlohmann::json& conf, typename cuann::FaissGpu::SearchParam& param) -{ - param.nprobe = conf.at("nprobe"); -} -#endif - -#ifdef RAFT_ANN_BENCH_USE_GGNN -template -void parse_build_param(const nlohmann::json& conf, typename cuann::Ggnn::BuildParam& param) -{ - param.dataset_size = conf.at("dataset_size"); - param.k = conf.at("k"); - - if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } - if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } - if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } - if (conf.contains("tau")) { param.tau = conf.at("tau"); } - if (conf.contains("refine_iterations")) { - param.refine_iterations = conf.at("refine_iterations"); - } -} - -template -void parse_search_param(const nlohmann::json& conf, typename cuann::Ggnn::SearchParam& param) -{ - param.tau = conf.at("tau"); - - if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } - if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } - if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } - if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } -} -#endif - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -template -void parse_build_param(const nlohmann::json& conf, - typename cuann::RaftIvfFlatGpu::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { - param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); - std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; - } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename cuann::RaftIvfFlatGpu::SearchParam& param) -{ - param.ivf_flat_params.n_probes = conf.at("nprobe"); -} -#endif - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ -template -void parse_build_param(const 
nlohmann::json& conf, - typename cuann::RaftIvfPQ::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } - if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } - if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename cuann::RaftIvfPQ::SearchParam& param) -{ - param.pq_param.n_probes = conf.at("numProbes"); - if (conf.contains("internalDistanceDtype")) { - std::string type = conf.at("internalDistanceDtype"); - if (type == "float") { - param.pq_param.internal_distance_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } else { - throw std::runtime_error("internalDistanceDtype: '" + type + - "', should be either 'float' or 'half'"); - } - } else { - // set half as default type - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } - - if (conf.contains("smemLutDtype")) { - std::string type = conf.at("smemLutDtype"); - if (type == "float") { - param.pq_param.lut_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.lut_dtype = CUDA_R_16F; - } else if (type == "fp8") { - param.pq_param.lut_dtype = CUDA_R_8U; - } else { - throw std::runtime_error("smemLutDtype: '" + type + - "', should be either 'float', 'half' or 'fp8'"); - } - } else { - // set half as default - param.pq_param.lut_dtype = CUDA_R_16F; - } -} -#endif - -template class Algo> -std::unique_ptr> make_algo(cuann::Metric metric, int dim, const nlohmann::json& conf) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, dim, param); -} - -template class Algo> -std::unique_ptr> make_algo(cuann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - typename 
Algo::BuildParam param; - parse_build_param(conf, param); - -#ifdef RAFT_ANN_BENCH_USE_MULTI_GPU - if (dev_list.empty()) { - return std::make_unique>(metric, dim, param); - } else { - return std::make_unique>>(metric, dim, param, dev_list); - } -#else - (void)dev_list; - return std::make_unique>(metric, dim, param); -#endif -} - -template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - // stop compiler warning; not all algorithms support multi-GPU so it may not be used - (void)dev_list; -#ifndef RAFT_ANN_BENCH_USE_MULTI_GPU - if (!dev_list.empty()) { - throw std::runtime_error( - "compiled without RAFT_ANN_BENCH_USE_MULTI_GPU, but a device list is given"); - } -#endif - - cuann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; - - if constexpr (std::is_same_v) { -#ifdef RAFT_ANN_BENCH_USE_FAISS - if (algo == "faiss_gpu_ivf_flat") { - ann = make_algo(metric, dim, conf, dev_list); - } else if (algo == "faiss_gpu_ivf_pq") { - ann = make_algo(metric, dim, conf); - } else if (algo == "faiss_gpu_ivf_sq") { - ann = make_algo(metric, dim, conf); - } else if (algo == "faiss_gpu_flat") { - ann = std::make_unique>(metric, dim); - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } -#endif - } - - if constexpr (std::is_same_v) {} - -#ifdef RAFT_ANN_BENCH_USE_GGNN - if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT - if (algo == "raft_ivf_flat") { - typename cuann::RaftIvfFlatGpu::BuildParam param; - parse_build_param(conf, param); - ann = std::make_unique>(metric, dim, param); - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ - if (algo == "raft_ivf_pq") { - typename cuann::RaftIvfPQ::BuildParam param; - parse_build_param(conf, param); - ann = std::make_unique>(metric, dim, param, refine_ratio); - } 
-#endif - if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } - - if (refine_ratio > 1.0) {} - return ann; -} - -template -std::unique_ptr::AnnSearchParam> create_search_param( - const std::string& algo, const nlohmann::json& conf) -{ -#ifdef RAFT_ANN_BENCH_USE_FAISS - if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } else if (algo == "faiss_gpu_flat") { - auto param = std::make_unique::AnnSearchParam>(); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { - auto param = std::make_unique::AnnSearchParam>(); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_GGNN - if (algo == "ggnn") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT - if (algo == "raft_ivf_flat") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ - if (algo == "raft_ivf_pq") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } -#endif - // else - throw std::runtime_error("invalid algo: '" + algo + "'"); -} - -} // namespace benchmark -#endif diff --git a/cpp/bench/ann/src/faiss_benchmark.cu b/cpp/bench/ann/src/faiss_benchmark.cu new file mode 100644 index 0000000000..fb4a0bb0aa --- /dev/null +++ b/cpp/bench/ann/src/faiss_benchmark.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.hpp" +#include "faiss_benchmark.cuh" + +int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss_benchmark.cuh b/cpp/bench/ann/src/faiss_benchmark.cuh new file mode 100644 index 0000000000..06fc6ab1e4 --- /dev/null +++ b/cpp/bench/ann/src/faiss_benchmark.cuh @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#undef WARP_SIZE +#include "faiss_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) +{ + param.nlist = conf.at("nlist"); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop 
compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss_wrapper.h b/cpp/bench/ann/src/faiss_wrapper.h index 803808e29c..fd223cc540 100644 --- a/cpp/bench/ann/src/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss_wrapper.h @@ -39,11 +39,11 @@ namespace { -faiss::MetricType parse_metric_type(cuann::Metric metric) +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) { - if (metric == cuann::Metric::kInnerProduct) { + if (metric == raft::bench::ann::Metric::kInnerProduct) { return faiss::METRIC_INNER_PRODUCT; - } else if (metric == cuann::Metric::kEuclidean) { + } else if (metric == raft::bench::ann::Metric::kEuclidean) { return faiss::METRIC_L2; } else { throw std::runtime_error("faiss supports only metric type of inner product and L2"); 
@@ -71,7 +71,7 @@ class OmpSingleThreadScope { } // namespace -namespace cuann { +namespace raft::bench::ann { template class FaissGpu : public ANN { @@ -310,6 +310,6 @@ class FaissGpuFlat : public FaissGpu { } }; -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn_benchmark.cu new file mode 100644 index 0000000000..f260880644 --- /dev/null +++ b/cpp/bench/ann/src/ggnn_benchmark.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.hpp" +#include "ggnn_benchmark.cuh" + +int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn_benchmark.cuh b/cpp/bench/ann/src/ggnn_benchmark.cuh new file mode 100644 index 0000000000..4f63e9f3f0 --- /dev/null +++ b/cpp/bench/ann/src/ggnn_benchmark.cuh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#undef WARP_SIZE +#include "ggnn_wrapper.cuh" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::BuildParam& param) +{ + param.dataset_size = conf.at("dataset_size"); + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template 
+std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) {} + + if constexpr (std::is_same_v) {} + + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn_wrapper.cuh index d0ae3d3c1b..c043093f3d 100644 --- a/cpp/bench/ann/src/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn_wrapper.cuh @@ -23,7 +23,7 @@ #include "cudart_util.h" #include -namespace cuann { +namespace raft::bench::ann { template class GgnnImpl; @@ -304,6 +304,6 @@ void GgnnImpl::load(const std::string& file) ANN_CUDA_CHECK(cudaStreamSynchronize(ggnn_device.stream)); } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib_benchmark.cpp new file mode 100644 index 0000000000..9fd0e2b752 --- /dev/null +++ b/cpp/bench/ann/src/hnswlib_benchmark.cpp @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hnswlib_benchmark.hpp" +#include "benchmark.hpp" + +int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/factory.h b/cpp/bench/ann/src/hnswlib_benchmark.hpp similarity index 56% rename from cpp/bench/ann/src/factory.h rename to cpp/bench/ann/src/hnswlib_benchmark.hpp index b4c3c7602d..fa9024c4dd 100644 --- a/cpp/bench/ann/src/factory.h +++ b/cpp/bench/ann/src/hnswlib_benchmark.hpp @@ -26,28 +26,26 @@ #include "ann.h" #undef WARP_SIZE -#ifdef RAFT_ANN_BENCH_USE_HNSWLIB #include "hnswlib_wrapper.h" -#endif #define JSON_DIAGNOSTICS 1 #include namespace benchmark { -cuann::Metric parse_metric(const std::string& metric_str) +raft::bench::ann::Metric parse_metric(const std::string& metric_str) { if (metric_str == "inner_product") { - return cuann::Metric::kInnerProduct; + return raft::bench::ann::Metric::kInnerProduct; } else if (metric_str == "euclidean") { - return cuann::Metric::kEuclidean; + return raft::bench::ann::Metric::kEuclidean; } else { throw std::runtime_error("invalid metric: '" + metric_str + "'"); } } -#ifdef RAFT_ANN_BENCH_USE_HNSWLIB template -void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::BuildParam& param) { param.ef_construction = conf.at("efConstruction"); param.M = conf.at("M"); @@ -55,15 +53,17 @@ void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::B } template -void 
parse_search_param(const nlohmann::json& conf, typename cuann::HnswLib::SearchParam& param) +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::SearchParam& param) { param.ef = conf.at("ef"); if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } } -#endif template class Algo> -std::unique_ptr> make_algo(cuann::Metric metric, int dim, const nlohmann::json& conf) +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) { typename Algo::BuildParam param; parse_build_param(conf, param); @@ -71,10 +71,10 @@ std::unique_ptr> make_algo(cuann::Metric metric, int dim, const nl } template class Algo> -std::unique_ptr> make_algo(cuann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) { typename Algo::BuildParam param; parse_build_param(conf, param); @@ -84,29 +84,25 @@ std::unique_ptr> make_algo(cuann::Metric metric, } template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) { // stop compiler warning; not all algorithms support multi-GPU so it may not be used (void)dev_list; - cuann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; if constexpr (std::is_same_v) { -#ifdef RAFT_ANN_BENCH_USE_HNSWLIB - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } -#endif + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } } if constexpr (std::is_same_v) { -#ifdef 
RAFT_ANN_BENCH_USE_HNSWLIB - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } -#endif + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } } if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } @@ -116,19 +112,16 @@ std::unique_ptr> create_algo(const std::string& algo, } template -std::unique_ptr::AnnSearchParam> create_search_param( +std::unique_ptr::AnnSearchParam> create_search_param( const std::string& algo, const nlohmann::json& conf) { -#ifdef RAFT_ANN_BENCH_USE_HNSWLIB if (algo == "hnswlib") { - auto param = std::make_unique::SearchParam>(); + auto param = std::make_unique::SearchParam>(); parse_search_param(conf, *param); return param; } -#endif // else throw std::runtime_error("invalid algo: '" + algo + "'"); } } // namespace benchmark -#endif diff --git a/cpp/bench/ann/src/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib_wrapper.h index c2241253a5..7d064ee6f6 100644 --- a/cpp/bench/ann/src/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib_wrapper.h @@ -35,7 +35,7 @@ #include "ann.h" #include -namespace cuann { +namespace raft::bench::ann { namespace { template @@ -328,6 +328,6 @@ void HnswLib::get_search_knn_results_(const T* query, } } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/multigpu.cuh b/cpp/bench/ann/src/multigpu.cuh index 0061298436..1675411a83 100644 --- a/cpp/bench/ann/src/multigpu.cuh +++ b/cpp/bench/ann/src/multigpu.cuh @@ -83,7 +83,7 @@ class DeviceRestorer { } // namespace -namespace cuann { +namespace raft::bench::ann { template class MultiGpuANN : public ANN { @@ -510,6 +510,6 @@ void MultiGpuANN::reset_search_data_placement_( arr, from, all_result_size, k, batch_size, dev_cnt_); } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/raft_cuann_utils.h b/cpp/bench/ann/src/raft_ann_bench_utils.h similarity index 83% rename from cpp/bench/ann/src/raft_cuann_utils.h rename to 
cpp/bench/ann/src/raft_ann_bench_utils.h index 0e3e78cad3..fa8cc4b824 100644 --- a/cpp/bench/ann/src/raft_cuann_utils.h +++ b/cpp/bench/ann/src/raft_ann_bench_utils.h @@ -31,19 +31,19 @@ #include #include -namespace cuann { +namespace raft::bench::ann { -inline raft::distance::DistanceType parse_metric_type(cuann::Metric metric) +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) { - if (metric == cuann::Metric::kInnerProduct) { + if (metric == raft::bench::ann::Metric::kInnerProduct) { return raft::distance::DistanceType::InnerProduct; - } else if (metric == cuann::Metric::kEuclidean) { + } else if (metric == raft::bench::ann::Metric::kEuclidean) { // Even for L2 expanded RAFT IVF Flat uses unexpanded formula return raft::distance::DistanceType::L2Expanded; } else { throw std::runtime_error("raft supports only metric type of inner product and L2"); } } -} // namespace cuann +} // namespace raft::bench::ann #endif \ No newline at end of file diff --git a/cpp/bench/ann/src/raft_benchmark.cu b/cpp/bench/ann/src/raft_benchmark.cu new file mode 100644 index 0000000000..30bcec7a09 --- /dev/null +++ b/cpp/bench/ann/src/raft_benchmark.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "benchmark.hpp" +#include "raft_benchmark.cuh" + +int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft_benchmark.cuh b/cpp/bench/ann/src/raft_benchmark.cuh new file mode 100644 index 0000000000..55e985e942 --- /dev/null +++ b/cpp/bench/ann/src/raft_benchmark.cuh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "ann.h" +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +#include "raft_ivf_pq_wrapper.h" +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / 
(double)conf.at("ratio"); + std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + param.pq_param.n_probes = conf.at("numProbes"); + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } +} 
+#endif + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = + std::make_unique>(metric, dim, param, refine_ratio); + } +#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif 
+#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft_ivf_flat.cu b/cpp/bench/ann/src/raft_ivf_flat.cu index 80b4f279cf..8c31652186 100644 --- a/cpp/bench/ann/src/raft_ivf_flat.cu +++ b/cpp/bench/ann/src/raft_ivf_flat.cu @@ -15,7 +15,7 @@ */ #include "raft_ivf_flat_wrapper.h" -namespace cuann { +namespace raft::bench::ann { template class RaftIvfFlatGpu; template class RaftIvfFlatGpu; -} // namespace cuann \ No newline at end of file +} // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft_ivf_flat_wrapper.h index e1f57d3c22..a740fb83ef 100644 --- a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft_ivf_flat_wrapper.h @@ -37,9 +37,9 @@ #include "ann.h" #include "cudart_util.h" -#include "raft_cuann_utils.h" +#include "raft_ann_bench_utils.h" -namespace cuann { +namespace raft::bench::ann { template class RaftIvfFlatGpu : public ANN { @@ -142,5 +142,5 @@ void RaftIvfFlatGpu::search( handle_.sync_stream(); return; } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/raft_ivf_pq.cu b/cpp/bench/ann/src/raft_ivf_pq.cu index 0369edfc56..2de81545aa 100644 --- a/cpp/bench/ann/src/raft_ivf_pq.cu +++ b/cpp/bench/ann/src/raft_ivf_pq.cu @@ -15,7 +15,7 @@ */ #include "raft_ivf_pq_wrapper.h" -namespace cuann { +namespace raft::bench::ann { template class RaftIvfPQ; template class RaftIvfPQ; -} // namespace cuann +} // namespace raft::bench::ann diff --git 
a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft_ivf_pq_wrapper.h index 0611b291d5..8e026819cb 100644 --- a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft_ivf_pq_wrapper.h @@ -35,9 +35,9 @@ #include "ann.h" #include "cudart_util.h" -#include "raft_cuann_utils.h" +#include "raft_ann_bench_utils.h" -namespace cuann { +namespace raft::bench::ann { template class RaftIvfPQ : public ANN { @@ -220,6 +220,6 @@ void RaftIvfPQ::search(const T* queries, handle_.sync_stream(); return; } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/raft_wrapper.h b/cpp/bench/ann/src/raft_wrapper.h index 01f6d4e4fe..5843ba508e 100644 --- a/cpp/bench/ann/src/raft_wrapper.h +++ b/cpp/bench/ann/src/raft_wrapper.h @@ -29,11 +29,11 @@ namespace raft_temp { -inline raft::distance::DistanceType parse_metric_type(cuann::Metric metric) +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) { - if (metric == cuann::Metric::kInnerProduct) { + if (metric == raft::bench::ann::Metric::kInnerProduct) { return raft::distance::DistanceType::InnerProduct; - } else if (metric == cuann::Metric::kEuclidean) { + } else if (metric == raft::bench::ann::Metric::kEuclidean) { return raft::distance::DistanceType::L2Expanded; } else { throw std::runtime_error("raft supports only metric type of inner product and L2"); @@ -42,7 +42,7 @@ inline raft::distance::DistanceType parse_metric_type(cuann::Metric metric) } // namespace raft_temp -namespace cuann { +namespace raft::bench::ann { // brute force fused L2 KNN - RAFT template @@ -150,6 +150,6 @@ void RaftGpu::search(const T* queries, metric_type_); } -} // namespace cuann +} // namespace raft::bench::ann #endif diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md index ac96c0d0cc..9ed9d2ffa1 100644 --- a/docs/source/cuda_ann_benchmarks.md +++ b/docs/source/cuda_ann_benchmarks.md @@ -30,7 +30,7 @@ Available targets to use 
with `--limit-bench-ann` are: - RAFT_IVF_FLAT_ANN_BENCH - RAFT_BFKNN_ANN_BENCH -By default, the `*_ANN_BENCH` executables program accept dataset of `float` type. To use other type, change the line `using data_t = float;` in `bench/ann/src/benchmark.cu` (or `bench/ann/src/benchmark/cpp` if benchmarking a non-CUDA algorithm) to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. +By default, the `*_ANN_BENCH` executables program accept dataset of `float` type. To use other type, change the line `using data_t = float;` in `cpp/bench/ann/src/benchmark.cu` (or `cpp/bench/ann/src/benchmark/cpp` if benchmarking a non-CUDA algorithm) to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. ### Usage @@ -46,7 +46,7 @@ A complete example (run from the RAFT source code root directory): # (1) prepare a dataset pip3 install numpy h5py # if they have not been installed already pushd -cd bench/ann +cd cpp/bench/ann mkdir data && cd data wget http://ann-benchmarks.com/glove-100-angular.hdf5 # option -n is used here to normalize vectors so cosine distance is converted @@ -67,7 +67,7 @@ popd # (4) evaluate result pushd -cd bench/ann +cd cpp/bench/ann ./scripts/eval.pl \ -o result.csv \ data/glove-100-inner/groundtruth.neighbors.ibin \ @@ -89,13 +89,13 @@ Some implementation can take `float16` database and query vectors as inputs and Commonly used datasets can be downloaded from two websites: 1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). - However, these datasets are in HDF5 format. Use `bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + However, these datasets are in HDF5 format. 
Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: ``` pip3 install numpy h5py ``` The usage of this script is: ``` - $ bench/ann/scripts/hdf5_to_fbin.py + $ cpp/bench/ann/scripts/hdf5_to_fbin.py usage: scripts/hdf5_to_fbin.py [-n] .hdf5 -n: normalize base/query set outputs: .base.fbin @@ -110,13 +110,13 @@ Commonly used datasets can be downloaded from two websites: 2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: ``` - $ bench/ann/scripts/split_groundtruth.pl + $ cpp/bench/ann/scripts/split_groundtruth.pl usage: script/split_groundtruth.pl input output_prefix ``` Take Deep-1B dataset as an example: ``` pushd - cd bench/ann + cd cpp/bench/ann mkdir -p data/deep-1B && cd data/deep-1B # download manually "Ground Truth" file of "Yandex DEEP" # suppose the file name is deep_new_groundtruth.public.10K.bin @@ -130,7 +130,7 @@ Commonly used datasets can be downloaded from two websites: #### step 2: building index An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. -To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`bench/ann/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: * `dataset` section specifies the name and files of a dataset, and also the distance in use. 
Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. * `search_basic_param` section specifies basic parameters for searching: @@ -206,9 +206,9 @@ Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the #### step 4: evaluating results -Use `bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: +Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: ``` -$ bench/ann/scripts/eval.pl +$ cpp/bench/ann/scripts/eval.pl usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... result_paths... are paths to the search result files. Can specify multiple paths. @@ -221,7 +221,7 @@ usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. An example: ``` -bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ +cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ result/glove-100-angular/10/faiss/ ``` @@ -234,7 +234,7 @@ It saves recall value in result txt file, so avoids to recompute recall if the s ## How to add a new ANN algorithm -Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `bench/ann/src/ann.h`) and implements all the pure virtual functions. +Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. 
In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: ``` @@ -273,7 +273,7 @@ The benchmark program uses JSON configuration file. To add the new algorithm to }, ``` -How to interpret these JSON objects is totally left to the implementation and should be specified in `bench/ann/src/factory.cuh`: +How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: * First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: ``` template From 9ca12298efb3849132e1977c32a7d03aa03504d9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 28 Feb 2023 13:57:59 -0500 Subject: [PATCH 09/39] Renaming namespace benchmark -> raft::bench::ann everywhere --- cpp/bench/ann/src/benchmark.hpp | 83 ++++++++++++------------- cpp/bench/ann/src/benchmark_util.hpp | 10 --- cpp/bench/ann/src/conf.cpp | 4 +- cpp/bench/ann/src/conf.h | 4 +- cpp/bench/ann/src/dataset.h | 4 +- cpp/bench/ann/src/faiss_benchmark.cu | 2 +- cpp/bench/ann/src/ggnn_benchmark.cu | 2 +- cpp/bench/ann/src/hnswlib_benchmark.cpp | 2 +- python/pylibraft/pylibraft/__init__.py | 2 +- python/pylibraft/setup.cfg | 2 +- python/raft-dask/raft_dask/__init__.py | 2 +- setup.cfg | 2 +- 12 files changed, 53 insertions(+), 66 deletions(-) diff --git a/cpp/bench/ann/src/benchmark.hpp b/cpp/bench/ann/src/benchmark.hpp index c9dd3ce676..4b8a9f96f1 100644 --- a/cpp/bench/ann/src/benchmark.hpp +++ b/cpp/bench/ann/src/benchmark.hpp @@ -42,18 +42,16 @@ using std::string; using std::to_string; using std::unordered_set; using std::vector; -using namespace benchmark; -using raft::bench::ann::MemoryType; namespace raft::bench::ann { // supported types: float, half (very few implementations support it), uint8_t, int8_t using data_t = float; -inline bool 
check_file_exist(const vector& files) +inline bool check_file_exist(const std::vector& files) { bool ret = true; - unordered_set processed; + std::unordered_set processed; for (const auto& file : files) { if (processed.find(file) == processed.end() && !file_exists(file)) { log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); @@ -64,7 +62,7 @@ inline bool check_file_exist(const vector& files) return ret; } -inline bool check_file_not_exist(const vector& files, bool force_overwrite) +inline bool check_file_not_exist(const std::vector& files, bool force_overwrite) { bool ret = true; for (const auto& file : files) { @@ -80,10 +78,10 @@ inline bool check_file_not_exist(const vector& files, bool force_overwri return ret; } -inline bool check_no_duplicate_file(const vector& files) +inline bool check_no_duplicate_file(const std::vector& files) { bool ret = true; - unordered_set processed; + std::unordered_set processed; for (const auto& file : files) { if (processed.find(file) != processed.end()) { log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); @@ -94,9 +92,9 @@ inline bool check_no_duplicate_file(const vector& files) return ret; } -inline bool mkdir(const vector& dirs) +inline bool mkdir(const std::vector& dirs) { - unordered_set processed; + std::unordered_set processed; for (const auto& dir : dirs) { if (processed.find(dir) == processed.end() && !dir_exists(dir)) { if (create_dir(dir)) { @@ -112,20 +110,20 @@ inline bool mkdir(const vector& dirs) return true; } -inline bool check(const vector& indices, +inline bool check(const std::vectorConfiguration::Index > &indices, bool build_mode, bool force_overwrite) { - vector files_should_exist; - vector dirs_should_exist; - vector output_files; + std::vector files_should_exist; + std::vector dirs_should_exist; + std::vector output_files; for (const auto& index : indices) { if (build_mode) { output_files.push_back(index.file); 
output_files.push_back(index.file + ".txt"); auto pos = index.file.rfind('/'); - if (pos != string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } + if (pos != std::string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } } else { files_should_exist.push_back(index.file); files_should_exist.push_back(index.file + ".txt"); @@ -134,7 +132,7 @@ inline bool check(const vector& indices, output_files.push_back(index.search_result_file + ".0.txt"); auto pos = index.search_result_file.rfind('/'); - if (pos != string::npos) { + if (pos != std::string::npos) { dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); } } @@ -148,12 +146,12 @@ inline bool check(const vector& indices, return ret; } -inline void write_build_info(const string& file_prefix, - const string& dataset, - const string& distance, - const string& name, - const string& algo, - const string& build_param, +inline void write_build_info(const std::string& file_prefix, + const std::string& dataset, + const std::string& distance, + const std::string& name, + const std::string& algo, + const std::string& build_param, float build_time) { std::ofstream ofs(file_prefix + ".txt"); @@ -170,7 +168,7 @@ inline void write_build_info(const string& file_prefix, } template -void build(const benchmark::Dataset* dataset, const vector& indices) +void build(const Dataset* dataset, const std::vector& indices) { cudaStream_t stream; ANN_CUDA_CHECK(cudaStreamCreate(&stream)); @@ -229,13 +227,13 @@ void build(const benchmark::Dataset* dataset, const vector -inline void search(const benchmark::Dataset* dataset, - const vector& indices) +inline void search(const Dataset* dataset, const std::vector& indices) { if (indices.empty()) { return; } cudaStream_t stream; @@ -303,12 +300,12 @@ inline void search(const benchmark::Dataset* dataset, query_set_size % batch_size); } const size_t num_batches = (query_set_size - 1) / batch_size + 1; - size_t* neighbors = new size_t[query_set_size * 
k]; + std::size_t* neighbors = new std::size_t[query_set_size * k]; int* neighbors_buf = new int[query_set_size * k]; float* distances = new float[query_set_size * k]; - vector search_times; + std::vector search_times; search_times.reserve(num_batches); - size_t* d_neighbors; + std::size_t* d_neighbors; float* d_distances; ANN_CUDA_CHECK(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); ANN_CUDA_CHECK(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); @@ -326,9 +323,9 @@ inline void search(const benchmark::Dataset* dataset, log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); algo->load(index.file); - const T* this_query_set = query_set; - size_t* this_neighbors = neighbors; - float* this_distances = distances; + const T* this_query_set = query_set; + std::size_t* this_neighbors = neighbors; + float* this_distances = distances; if (algo_property.query_memory_type == MemoryType::Device) { this_query_set = d_query_set; this_neighbors = d_neighbors; @@ -371,8 +368,8 @@ inline void search(const benchmark::Dataset* dataset, float best_search_time_p999 = std::numeric_limits::max(); for (int run = 0; run < run_count; ++run) { log_info("run %d / %d", run + 1, run_count); - for (size_t batch_id = 0; batch_id < num_batches; ++batch_id) { - size_t row = batch_id * batch_size; + for (std::size_t batch_id = 0; batch_id < num_batches; ++batch_id) { + std::size_t row = batch_id * batch_size; int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); #ifdef NVTX @@ -471,7 +468,7 @@ inline void search(const benchmark::Dataset* dataset, ANN_CUDA_CHECK(cudaStreamDestroy(stream)); } -inline const string usage(const string& argv0) +inline const std::string usage(const string& argv0) { return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + " -b: build mode, will build index\n" + @@ -507,11 +504,11 @@ inline int run_main(int argc, char** argv) } } if (build_mode == search_mode) { - cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; + std::cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; return -1; } if (argc - optind != 1) { - cerr << usage(argv[0]) << endl; + std::cerr << usage(argv[0]) << endl; return -1; } string conf_file = argv[optind]; @@ -536,7 +533,7 @@ inline int run_main(int argc, char** argv) vector indices = conf.get_indices(index_patterns); if (!check(indices, build_mode, force_overwrite)) { return -1; } - string message = "will "; + std::string message = "will "; message += build_mode ? 
"build:" : "search:"; for (const auto& index : indices) { message += "\n " + index.name; diff --git a/cpp/bench/ann/src/benchmark_util.hpp b/cpp/bench/ann/src/benchmark_util.hpp index fa848977f9..13a061e444 100644 --- a/cpp/bench/ann/src/benchmark_util.hpp +++ b/cpp/bench/ann/src/benchmark_util.hpp @@ -17,16 +17,6 @@ #include -using std::cerr; -using std::cout; -using std::endl; -using std::string; -using std::to_string; -using std::unordered_set; -using std::vector; -using namespace benchmark; -using raft::bench::ann::MemoryType; - namespace raft::bench::ann { inline raft::bench::ann::Metric parse_metric(const std::string& metric_str) diff --git a/cpp/bench/ann/src/conf.cpp b/cpp/bench/ann/src/conf.cpp index 90b164c076..66a8d252b4 100644 --- a/cpp/bench/ann/src/conf.cpp +++ b/cpp/bench/ann/src/conf.cpp @@ -23,7 +23,7 @@ #include "util.h" -namespace benchmark { +namespace raft::bench::ann { using std::runtime_error; using std::string; using std::unordered_set; @@ -133,4 +133,4 @@ unordered_set Configuration::match_(const vector& candidates, return matched; } -} // namespace benchmark +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/conf.h b/cpp/bench/ann/src/conf.h index 987f2d52aa..cdf8968d8d 100644 --- a/cpp/bench/ann/src/conf.h +++ b/cpp/bench/ann/src/conf.h @@ -24,7 +24,7 @@ #define JSON_DIAGNOSTICS 1 #include -namespace benchmark { +namespace raft::bench::ann { class Configuration { public: @@ -71,6 +71,6 @@ class Configuration { std::vector indices_; }; -} // namespace benchmark +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/dataset.h b/cpp/bench/ann/src/dataset.h index b756b204d4..e427a2c9dd 100644 --- a/cpp/bench/ann/src/dataset.h +++ b/cpp/bench/ann/src/dataset.h @@ -31,7 +31,7 @@ #include "cudart_util.h" -namespace benchmark { +namespace raft::bench::ann { // http://big-ann-benchmarks.com/index.html: // binary format that starts with 8 bytes of data consisting of num_points(uint32_t) @@ -380,6 +380,6 @@ void 
BinDataset::map_base_set_() const this->mapped_base_set_ = reinterpret_cast(original_map_ptr + subset_offset_); } -} // namespace benchmark +} // namespace raft::bench::ann #endif diff --git a/cpp/bench/ann/src/faiss_benchmark.cu b/cpp/bench/ann/src/faiss_benchmark.cu index fb4a0bb0aa..3605c190c7 100644 --- a/cpp/bench/ann/src/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss_benchmark.cu @@ -16,4 +16,4 @@ #include "benchmark.hpp" #include "faiss_benchmark.cuh" -int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file +int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn_benchmark.cu index f260880644..2978566c23 100644 --- a/cpp/bench/ann/src/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn_benchmark.cu @@ -16,4 +16,4 @@ #include "benchmark.hpp" #include "ggnn_benchmark.cuh" -int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file +int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib_benchmark.cpp index 9fd0e2b752..72625b780d 100644 --- a/cpp/bench/ann/src/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib_benchmark.cpp @@ -16,4 +16,4 @@ #include "hnswlib_benchmark.hpp" #include "benchmark.hpp" -int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file +int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py index ab979429d2..39145085f0 100644 --- a/python/pylibraft/pylibraft/__init__.py +++ b/python/pylibraft/pylibraft/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg index 8da78d2d86..7d1a0c9065 100644 --- a/python/pylibraft/setup.cfg +++ b/python/pylibraft/setup.cfg @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. [isort] line_length=79 diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py index 76e096c8f2..4f4700df48 100644 --- a/python/raft-dask/raft_dask/__init__.py +++ b/python/raft-dask/raft_dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/setup.cfg b/setup.cfg index 9d7f789391..e64641d05b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. [flake8] filename = *.py, *.pyx, *.pxd, *.pxi From 6b8e25aff75bdd54d231cf496e9038b0e4cc2a96 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 28 Feb 2023 15:18:43 -0500 Subject: [PATCH 10/39] Getting new stuff to build. Now to test it... 
--- cpp/bench/ann/src/{ann.h => ann.hpp} | 7 +++---- cpp/bench/ann/src/benchmark.hpp | 8 +++++--- cpp/bench/ann/src/benchmark_util.hpp | 5 +++-- cpp/bench/ann/src/faiss_benchmark.cu | 2 +- cpp/bench/ann/src/faiss_benchmark.cuh | 2 +- cpp/bench/ann/src/faiss_wrapper.h | 2 +- cpp/bench/ann/src/ggnn_benchmark.cu | 2 +- cpp/bench/ann/src/ggnn_benchmark.cuh | 2 +- cpp/bench/ann/src/ggnn_wrapper.cuh | 2 +- cpp/bench/ann/src/hnswlib_benchmark.cpp | 2 +- cpp/bench/ann/src/hnswlib_benchmark.hpp | 23 +++++++---------------- cpp/bench/ann/src/hnswlib_wrapper.h | 12 +++--------- cpp/bench/ann/src/multigpu.cuh | 2 +- cpp/bench/ann/src/raft_benchmark.cu | 2 +- cpp/bench/ann/src/raft_benchmark.cuh | 2 +- cpp/bench/ann/src/raft_ivf_flat_wrapper.h | 2 +- cpp/bench/ann/src/raft_ivf_pq_wrapper.h | 2 +- cpp/bench/ann/src/raft_wrapper.h | 2 +- cpp/bench/ann/src/util.cpp | 4 ++-- cpp/bench/ann/src/util.h | 4 ++-- 20 files changed, 38 insertions(+), 51 deletions(-) rename cpp/bench/ann/src/{ann.h => ann.hpp} (98%) diff --git a/cpp/bench/ann/src/ann.h b/cpp/bench/ann/src/ann.hpp similarity index 98% rename from cpp/bench/ann/src/ann.h rename to cpp/bench/ann/src/ann.hpp index 105688228d..8f73896e07 100644 --- a/cpp/bench/ann/src/ann.h +++ b/cpp/bench/ann/src/ann.hpp @@ -1,5 +1,4 @@ -#ifndef ANN_H_ -#define ANN_H_ + /* * Copyright (c) 2023, NVIDIA CORPORATION. @@ -16,6 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#pragma once #include #include @@ -85,5 +86,3 @@ class ANN { }; } // namespace raft::bench::ann - -#endif // ANN_H_ diff --git a/cpp/bench/ann/src/benchmark.hpp b/cpp/bench/ann/src/benchmark.hpp index 4b8a9f96f1..7ba40512a7 100644 --- a/cpp/bench/ann/src/benchmark.hpp +++ b/cpp/bench/ann/src/benchmark.hpp @@ -110,7 +110,7 @@ inline bool mkdir(const std::vector& dirs) return true; } -inline bool check(const std::vectorConfiguration::Index > &indices, +inline bool check(const std::vector& indices, bool build_mode, bool force_overwrite) { @@ -489,7 +489,7 @@ inline int run_main(int argc, char** argv) bool build_mode = false; bool search_mode = false; bool only_check = false; - string index_patterns("*"); + std::string index_patterns("*"); int opt; while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { @@ -554,5 +554,7 @@ inline int run_main(int argc, char** argv) log_error("exception occurs: %s", e.what()); return -1; } + + return 0; } -} // namespace raft::bench::ann +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/benchmark_util.hpp b/cpp/bench/ann/src/benchmark_util.hpp index 13a061e444..52bd193042 100644 --- a/cpp/bench/ann/src/benchmark_util.hpp +++ b/cpp/bench/ann/src/benchmark_util.hpp @@ -15,11 +15,12 @@ */ #pragma once +#include "ann.hpp" #include namespace raft::bench::ann { -inline raft::bench::ann::Metric parse_metric(const std::string& metric_str) +inline Metric parse_metric(const std::string& metric_str) { if (metric_str == "inner_product") { return raft::bench::ann::Metric::kInnerProduct; @@ -29,4 +30,4 @@ inline raft::bench::ann::Metric parse_metric(const std::string& metric_str) throw std::runtime_error("invalid metric: '" + metric_str + "'"); } } -} // namespace raft::bench::ann \ No newline at end of file +}; // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss_benchmark.cu b/cpp/bench/ann/src/faiss_benchmark.cu index 3605c190c7..5370f56e45 100644 --- 
a/cpp/bench/ann/src/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss_benchmark.cu @@ -16,4 +16,4 @@ #include "benchmark.hpp" #include "faiss_benchmark.cuh" -int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss_benchmark.cuh b/cpp/bench/ann/src/faiss_benchmark.cuh index 06fc6ab1e4..a8e8acc6f9 100644 --- a/cpp/bench/ann/src/faiss_benchmark.cuh +++ b/cpp/bench/ann/src/faiss_benchmark.cuh @@ -24,7 +24,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #undef WARP_SIZE #include "faiss_wrapper.h" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/faiss_wrapper.h b/cpp/bench/ann/src/faiss_wrapper.h index fd223cc540..b3ce5ca675 100644 --- a/cpp/bench/ann/src/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss_wrapper.h @@ -35,7 +35,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" namespace { diff --git a/cpp/bench/ann/src/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn_benchmark.cu index 2978566c23..6aa6ac0ec0 100644 --- a/cpp/bench/ann/src/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn_benchmark.cu @@ -16,4 +16,4 @@ #include "benchmark.hpp" #include "ggnn_benchmark.cuh" -int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn_benchmark.cuh b/cpp/bench/ann/src/ggnn_benchmark.cuh index 4f63e9f3f0..d23fb10d3e 100644 --- a/cpp/bench/ann/src/ggnn_benchmark.cuh +++ b/cpp/bench/ann/src/ggnn_benchmark.cuh @@ -23,7 +23,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #undef WARP_SIZE #include "ggnn_wrapper.cuh" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn_wrapper.cuh index 
c043093f3d..de61094d27 100644 --- a/cpp/bench/ann/src/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn_wrapper.cuh @@ -19,7 +19,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #include "cudart_util.h" #include diff --git a/cpp/bench/ann/src/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib_benchmark.cpp index 72625b780d..ecf29677ac 100644 --- a/cpp/bench/ann/src/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib_benchmark.cpp @@ -16,4 +16,4 @@ #include "hnswlib_benchmark.hpp" #include "benchmark.hpp" -int run_main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib_benchmark.hpp b/cpp/bench/ann/src/hnswlib_benchmark.hpp index fa9024c4dd..5e59a88f9b 100644 --- a/cpp/bench/ann/src/hnswlib_benchmark.hpp +++ b/cpp/bench/ann/src/hnswlib_benchmark.hpp @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef FACTORY_H_ -#define FACTORY_H_ + +#pragma once #include #include @@ -24,24 +24,15 @@ #include #include -#include "ann.h" +#include "benchmark_util.hpp" + +#include "ann.hpp" #undef WARP_SIZE #include "hnswlib_wrapper.h" #define JSON_DIAGNOSTICS 1 #include -namespace benchmark { - -raft::bench::ann::Metric parse_metric(const std::string& metric_str) -{ - if (metric_str == "inner_product") { - return raft::bench::ann::Metric::kInnerProduct; - } else if (metric_str == "euclidean") { - return raft::bench::ann::Metric::kEuclidean; - } else { - throw std::runtime_error("invalid metric: '" + metric_str + "'"); - } -} +namespace raft::bench::ann { template void parse_build_param(const nlohmann::json& conf, @@ -124,4 +115,4 @@ std::unique_ptr::AnnSearchParam> create_search throw std::runtime_error("invalid algo: '" + algo + "'"); } -} // namespace benchmark +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib_wrapper.h index 7d064ee6f6..867199723b 100644 --- a/cpp/bench/ann/src/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib_wrapper.h @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef HNSWLIB_WRAPPER_H_ -#define HNSWLIB_WRAPPER_H_ +#pragma once #include #include @@ -32,12 +31,11 @@ #include #include -#include "ann.h" +#include "ann.hpp" #include namespace raft::bench::ann { -namespace { template struct hnsw_dist_t { using type = void; @@ -159,8 +157,6 @@ class FixedThreadPool { std::atomic finished_{false}; }; -} // namespace - template class HnswLib : public ANN { public: @@ -328,6 +324,4 @@ void HnswLib::get_search_knn_results_(const T* query, } } -} // namespace raft::bench::ann - -#endif +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/multigpu.cuh b/cpp/bench/ann/src/multigpu.cuh index 1675411a83..4cbf108f5c 100644 --- a/cpp/bench/ann/src/multigpu.cuh +++ b/cpp/bench/ann/src/multigpu.cuh @@ -27,7 +27,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #include "cudart_util.h" #define NCCLCHECK(cmd) \ diff --git a/cpp/bench/ann/src/raft_benchmark.cu b/cpp/bench/ann/src/raft_benchmark.cu index 30bcec7a09..dc34ae2401 100644 --- a/cpp/bench/ann/src/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft_benchmark.cu @@ -16,4 +16,4 @@ #include "benchmark.hpp" #include "raft_benchmark.cuh" -int run_main(int argc, char** argv) { return run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft_benchmark.cuh b/cpp/bench/ann/src/raft_benchmark.cuh index 55e985e942..1eb195537b 100644 --- a/cpp/bench/ann/src/raft_benchmark.cuh +++ b/cpp/bench/ann/src/raft_benchmark.cuh @@ -22,7 +22,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #undef WARP_SIZE #ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN #include "raft_wrapper.h" diff --git a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft_ivf_flat_wrapper.h index a740fb83ef..c17303b5c1 100644 --- a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft_ivf_flat_wrapper.h @@ -35,7 +35,7 @@ #include #include 
-#include "ann.h" +#include "ann.hpp" #include "cudart_util.h" #include "raft_ann_bench_utils.h" diff --git a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft_ivf_pq_wrapper.h index 8e026819cb..f775b082d2 100644 --- a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft_ivf_pq_wrapper.h @@ -33,7 +33,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" #include "cudart_util.h" #include "raft_ann_bench_utils.h" diff --git a/cpp/bench/ann/src/raft_wrapper.h b/cpp/bench/ann/src/raft_wrapper.h index 5843ba508e..29e88f9c93 100644 --- a/cpp/bench/ann/src/raft_wrapper.h +++ b/cpp/bench/ann/src/raft_wrapper.h @@ -25,7 +25,7 @@ #include #include -#include "ann.h" +#include "ann.hpp" namespace raft_temp { diff --git a/cpp/bench/ann/src/util.cpp b/cpp/bench/ann/src/util.cpp index 3225e16e78..17636f76d7 100644 --- a/cpp/bench/ann/src/util.cpp +++ b/cpp/bench/ann/src/util.cpp @@ -21,7 +21,7 @@ #include #include -namespace benchmark { +namespace raft::bench::ann { std::vector split(const std::string& s, char delimiter) { @@ -65,4 +65,4 @@ bool create_dir(const std::string& dir) return true; } -} // namespace benchmark +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/util.h b/cpp/bench/ann/src/util.h index e317cee4ac..0599dc3bf2 100644 --- a/cpp/bench/ann/src/util.h +++ b/cpp/bench/ann/src/util.h @@ -23,7 +23,7 @@ #include #include -namespace benchmark { +namespace raft::bench::ann { class Timer { public: @@ -77,6 +77,6 @@ void log_error(Ts... vs) log_("error", vs...); } -} // namespace benchmark +} // namespace raft::bench::ann #endif From 2bd4ed9491bd74ad5b6f5943300f7c4ae0695a3e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 Mar 2023 19:10:26 -0500 Subject: [PATCH 11/39] Fixing ggnn --- cpp/bench/ann/CMakeLists.txt | 24 +- cpp/bench/ann/src/{ => common}/ann.hpp | 0 cpp/bench/ann/src/{ => common}/benchmark.hpp | 0 .../ann/src/{ => common}/benchmark_util.hpp | 0 cpp/bench/ann/src/{ => common}/conf.cpp | 0 cpp/bench/ann/src/{ => common}/conf.h | 0 cpp/bench/ann/src/{ => common}/cudart_util.h | 0 cpp/bench/ann/src/{ => common}/dataset.h | 0 cpp/bench/ann/src/{ => common}/util.cpp | 0 cpp/bench/ann/src/{ => common}/util.h | 0 .../ann/src/{ => faiss}/faiss_benchmark.cu | 2 +- .../ann/src/{ => faiss}/faiss_benchmark.cuh | 2 +- cpp/bench/ann/src/{ => faiss}/faiss_wrapper.h | 2 +- .../ann/src/{ => ggnn}/ggnn_benchmark.cu | 2 +- .../ann/src/{ => ggnn}/ggnn_benchmark.cuh | 2 +- cpp/bench/ann/src/{ => ggnn}/ggnn_wrapper.cuh | 5 +- .../src/{ => hnswlib}/hnswlib_benchmark.cpp | 2 +- .../src/{ => hnswlib}/hnswlib_benchmark.hpp | 4 +- .../ann/src/{ => hnswlib}/hnswlib_wrapper.h | 2 +- cpp/bench/ann/src/multigpu.cuh | 515 ------------------ .../ann/src/{ => raft}/raft_ann_bench_utils.h | 0 .../ann/src/{ => raft}/raft_benchmark.cu | 2 +- .../ann/src/{ => raft}/raft_benchmark.cuh | 2 +- cpp/bench/ann/src/{ => raft}/raft_ivf_flat.cu | 0 .../src/{ => raft}/raft_ivf_flat_wrapper.h | 4 +- cpp/bench/ann/src/{ => raft}/raft_ivf_pq.cu | 0 .../ann/src/{ => raft}/raft_ivf_pq_wrapper.h | 4 +- cpp/bench/ann/src/{ => raft}/raft_wrapper.h | 2 +- .../third_party => cmake}/patches/ggnn.patch | 0 .../patches/nlohmann_json.patch} | 0 cpp/cmake/thirdparty/get_ggnn.cmake | 13 +- cpp/cmake/thirdparty/get_glog.cmake | 49 ++ cpp/cmake/thirdparty/get_hnswlib.cmake | 3 +- 33 files changed, 93 insertions(+), 548 deletions(-) rename cpp/bench/ann/src/{ => common}/ann.hpp (100%) rename cpp/bench/ann/src/{ => common}/benchmark.hpp (100%) rename cpp/bench/ann/src/{ => common}/benchmark_util.hpp (100%) rename cpp/bench/ann/src/{ => common}/conf.cpp (100%) rename cpp/bench/ann/src/{ => common}/conf.h (100%) rename 
cpp/bench/ann/src/{ => common}/cudart_util.h (100%) rename cpp/bench/ann/src/{ => common}/dataset.h (100%) rename cpp/bench/ann/src/{ => common}/util.cpp (100%) rename cpp/bench/ann/src/{ => common}/util.h (100%) rename cpp/bench/ann/src/{ => faiss}/faiss_benchmark.cu (95%) rename cpp/bench/ann/src/{ => faiss}/faiss_benchmark.cuh (99%) rename cpp/bench/ann/src/{ => faiss}/faiss_wrapper.h (99%) rename cpp/bench/ann/src/{ => ggnn}/ggnn_benchmark.cu (95%) rename cpp/bench/ann/src/{ => ggnn}/ggnn_benchmark.cuh (99%) rename cpp/bench/ann/src/{ => ggnn}/ggnn_wrapper.cuh (99%) rename cpp/bench/ann/src/{ => hnswlib}/hnswlib_benchmark.cpp (95%) rename cpp/bench/ann/src/{ => hnswlib}/hnswlib_benchmark.hpp (98%) rename cpp/bench/ann/src/{ => hnswlib}/hnswlib_wrapper.h (99%) delete mode 100644 cpp/bench/ann/src/multigpu.cuh rename cpp/bench/ann/src/{ => raft}/raft_ann_bench_utils.h (100%) rename cpp/bench/ann/src/{ => raft}/raft_benchmark.cu (95%) rename cpp/bench/ann/src/{ => raft}/raft_benchmark.cuh (99%) rename cpp/bench/ann/src/{ => raft}/raft_ivf_flat.cu (100%) rename cpp/bench/ann/src/{ => raft}/raft_ivf_flat_wrapper.h (98%) rename cpp/bench/ann/src/{ => raft}/raft_ivf_pq.cu (100%) rename cpp/bench/ann/src/{ => raft}/raft_ivf_pq_wrapper.h (99%) rename cpp/bench/ann/src/{ => raft}/raft_wrapper.h (99%) rename cpp/{bench/ann/third_party => cmake}/patches/ggnn.patch (100%) rename cpp/{bench/ann/third_party/patches/json.patch => cmake/patches/nlohmann_json.patch} (100%) create mode 100644 cpp/cmake/thirdparty/get_glog.cmake diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index dca63538b7..80eb66af7d 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -21,8 +21,8 @@ option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in bench option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" OFF) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in 
benchmark" OFF) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) set(RAFT_ANN_BENCH_USE_FAISS OFF) if(RAFT_ANN_BENCH_USE_FAISS_BFKNN @@ -69,7 +69,8 @@ function(ConfigureCuannBench) set(BENCH_NAME ${ConfigureCuannBench_NAME}_ANN_BENCH) add_executable( - ${BENCH_NAME} ${ConfigureCuannBench_PATH} bench/ann/src/conf.cpp bench/ann/src/util.cpp + ${BENCH_NAME} ${ConfigureCuannBench_PATH} bench/ann/src/common/conf.cpp + bench/ann/src/common/util.cpp ) target_link_libraries( ${BENCH_NAME} @@ -125,7 +126,7 @@ endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureCuannBench( - NAME HNSWLIB PATH bench/ann/src/hnswlib_benchmark.cpp INCLUDES + NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS -mavx ) endif() @@ -135,9 +136,9 @@ if(RAFT_ANN_BENCH_USE_RAFT) NAME RAFT_IVF_PQ PATH - bench/ann/src/raft_benchmark.cu - $<$:bench/ann/src/raft_ivf_pq.cu> - $<$:bench/ann/src/raft_ivf_flat.cu> + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_ivf_pq.cu> + $<$:bench/ann/src/raft/raft_ivf_flat.cu> LINKS raft::distance raft::nn @@ -145,12 +146,15 @@ if(RAFT_ANN_BENCH_USE_RAFT) endif() if(RAFT_ANN_BENCH_USE_FAISS) - ConfigureCuannBench(NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss_benchmark.cu LINKS faiss::faiss) + ConfigureCuannBench( + NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) endif() if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_glog.cmake) ConfigureCuannBench( - NAME GGNN PATH bench/ann/src/ggnn_benchmark.cu INCLUDES - ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include + NAME GGNN PATH 
bench/ann/src/ggnn/ggnn_benchmark.cu INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include LINKS glog::glog ) endif() diff --git a/cpp/bench/ann/src/ann.hpp b/cpp/bench/ann/src/common/ann.hpp similarity index 100% rename from cpp/bench/ann/src/ann.hpp rename to cpp/bench/ann/src/common/ann.hpp diff --git a/cpp/bench/ann/src/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp similarity index 100% rename from cpp/bench/ann/src/benchmark.hpp rename to cpp/bench/ann/src/common/benchmark.hpp diff --git a/cpp/bench/ann/src/benchmark_util.hpp b/cpp/bench/ann/src/common/benchmark_util.hpp similarity index 100% rename from cpp/bench/ann/src/benchmark_util.hpp rename to cpp/bench/ann/src/common/benchmark_util.hpp diff --git a/cpp/bench/ann/src/conf.cpp b/cpp/bench/ann/src/common/conf.cpp similarity index 100% rename from cpp/bench/ann/src/conf.cpp rename to cpp/bench/ann/src/common/conf.cpp diff --git a/cpp/bench/ann/src/conf.h b/cpp/bench/ann/src/common/conf.h similarity index 100% rename from cpp/bench/ann/src/conf.h rename to cpp/bench/ann/src/common/conf.h diff --git a/cpp/bench/ann/src/cudart_util.h b/cpp/bench/ann/src/common/cudart_util.h similarity index 100% rename from cpp/bench/ann/src/cudart_util.h rename to cpp/bench/ann/src/common/cudart_util.h diff --git a/cpp/bench/ann/src/dataset.h b/cpp/bench/ann/src/common/dataset.h similarity index 100% rename from cpp/bench/ann/src/dataset.h rename to cpp/bench/ann/src/common/dataset.h diff --git a/cpp/bench/ann/src/util.cpp b/cpp/bench/ann/src/common/util.cpp similarity index 100% rename from cpp/bench/ann/src/util.cpp rename to cpp/bench/ann/src/common/util.cpp diff --git a/cpp/bench/ann/src/util.h b/cpp/bench/ann/src/common/util.h similarity index 100% rename from cpp/bench/ann/src/util.h rename to cpp/bench/ann/src/common/util.h diff --git a/cpp/bench/ann/src/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu similarity index 95% rename from cpp/bench/ann/src/faiss_benchmark.cu rename to 
cpp/bench/ann/src/faiss/faiss_benchmark.cu index 5370f56e45..1730bd0683 100644 --- a/cpp/bench/ann/src/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "benchmark.hpp" +#include "../common/benchmark.hpp" #include "faiss_benchmark.cuh" int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss_benchmark.cuh b/cpp/bench/ann/src/faiss/faiss_benchmark.cuh similarity index 99% rename from cpp/bench/ann/src/faiss_benchmark.cuh rename to cpp/bench/ann/src/faiss/faiss_benchmark.cuh index a8e8acc6f9..01a0cadeaa 100644 --- a/cpp/bench/ann/src/faiss_benchmark.cuh +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cuh @@ -24,7 +24,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" #undef WARP_SIZE #include "faiss_wrapper.h" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h similarity index 99% rename from cpp/bench/ann/src/faiss_wrapper.h rename to cpp/bench/ann/src/faiss/faiss_wrapper.h index b3ce5ca675..02070e05e7 100644 --- a/cpp/bench/ann/src/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -35,7 +35,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" namespace { diff --git a/cpp/bench/ann/src/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu similarity index 95% rename from cpp/bench/ann/src/ggnn_benchmark.cu rename to cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index 6aa6ac0ec0..95f5763e36 100644 --- a/cpp/bench/ann/src/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "benchmark.hpp" +#include "../common/benchmark.hpp" #include "ggnn_benchmark.cuh" int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn_benchmark.cuh b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh similarity index 99% rename from cpp/bench/ann/src/ggnn_benchmark.cuh rename to cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh index d23fb10d3e..a5b6957c0c 100644 --- a/cpp/bench/ann/src/ggnn_benchmark.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh @@ -23,7 +23,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" #undef WARP_SIZE #include "ggnn_wrapper.cuh" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh similarity index 99% rename from cpp/bench/ann/src/ggnn_wrapper.cuh rename to cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index de61094d27..e6925a1dec 100644 --- a/cpp/bench/ann/src/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -19,8 +19,9 @@ #include #include -#include "ann.hpp" -#include "cudart_util.h" +#include "../common/ann.hpp" +#include "../common/benchmark_util.hpp" +#include "../common/cudart_util.h" #include namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp similarity index 95% rename from cpp/bench/ann/src/hnswlib_benchmark.cpp rename to cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index ecf29677ac..75192d68a5 100644 --- a/cpp/bench/ann/src/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -14,6 +14,6 @@ * limitations under the License. 
*/ #include "hnswlib_benchmark.hpp" -#include "benchmark.hpp" +#include "../common/benchmark.hpp" int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib_benchmark.hpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp similarity index 98% rename from cpp/bench/ann/src/hnswlib_benchmark.hpp rename to cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp index 5e59a88f9b..6d3851799e 100644 --- a/cpp/bench/ann/src/hnswlib_benchmark.hpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp @@ -24,9 +24,9 @@ #include #include -#include "benchmark_util.hpp" +#include "../common/benchmark_util.hpp" -#include "ann.hpp" +#include "../common/ann.hpp" #undef WARP_SIZE #include "hnswlib_wrapper.h" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h similarity index 99% rename from cpp/bench/ann/src/hnswlib_wrapper.h rename to cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 867199723b..379a0dc482 100644 --- a/cpp/bench/ann/src/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -31,7 +31,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" #include namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/multigpu.cuh b/cpp/bench/ann/src/multigpu.cuh deleted file mode 100644 index 4cbf108f5c..0000000000 --- a/cpp/bench/ann/src/multigpu.cuh +++ /dev/null @@ -1,515 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MULTIGPU_H_ -#define MULTIGPU_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ann.hpp" -#include "cudart_util.h" - -#define NCCLCHECK(cmd) \ - do { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -namespace { - -__global__ void add_index_offset_kernel(size_t* arr, size_t len, size_t offset) -{ - size_t id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < len) arr[id] += offset; -} - -template -__global__ void reset_search_data_placement_kernel( - T* arr, T* from, int len, int k, int batch_size, int dev_cnt) -{ - size_t id = blockIdx.x * blockDim.x + threadIdx.x; - size_t dev = id / (k * batch_size); - size_t batch_id = id % (k * batch_size) / k; - size_t to_id = batch_id * k * dev_cnt + dev * k + id % k; - if (id < len) arr[to_id] = from[id]; -} - -template -constexpr ncclDataType_t get_nccl_datatype() -{ - if (std::is_same_v) { - static_assert(sizeof(float) == 4, "float size is not 32 bits"); - return ncclFloat32; - } - if (std::is_same_v) return ncclUint64; - if (std::is_same_v) return ncclInt8; - if (std::is_same_v) return ncclUint8; - if (std::is_same_v) return ncclFloat16; - throw std::runtime_error("no supported nccl datatype"); -} - -class DeviceRestorer { - public: - DeviceRestorer() { ANN_CUDA_CHECK(cudaGetDevice(&cur_dev)); } - ~DeviceRestorer() { ANN_CUDA_CHECK(cudaSetDevice(cur_dev)); } - - private: - int cur_dev; -}; - -} // namespace - -namespace raft::bench::ann { - -template -class MultiGpuANN : public ANN { - public: - using typename ANN::AnnSearchParam; - - MultiGpuANN(Metric metric, - int dim, - const typename Algo::BuildParam& param, - const std::vector& dev_list); - - ~MultiGpuANN(); - - void build(const T* dataset, 
size_t nrow, cudaStream_t stream = 0) override; - - void set_search_param(const AnnSearchParam& param) override; - - void search(const T* queries, - int batch_size, - int k, - size_t* neighbors, - float* distances, - cudaStream_t stream = 0) const override; - - void save(const std::string& file) const override; - void load(const std::string& file) override; - - AlgoProperty get_property() const override - { - AlgoProperty property; - if (dev_ann_property_.dataset_memory_type == MemoryType::Host) { - property.dataset_memory_type = MemoryType::Host; - } else if (dev_ann_property_.dataset_memory_type == MemoryType::Device) { - property.dataset_memory_type = MemoryType::HostMmap; - } else { - throw std::runtime_error("multigpu: invalid device algo dataset memory type"); - } - property.query_memory_type = MemoryType::Device; - property.need_dataset_when_search = dev_ann_property_.need_dataset_when_search; - return property; - } - - void set_search_dataset(const T* dataset, size_t nrow) override; - - private: - void distribute_dataset_(const T* dataset, size_t nrow); - void add_index_offset_(size_t* arr, size_t len, size_t offset, cudaStream_t stream) const; - void set_wait_for_all_streams_(cudaStream_t stream) const; - template - void reset_search_data_placement_( - U* arr, U* from, int k, int batch_size, size_t all_result_size, cudaStream_t stream) const; - - const static int block_size_ = 256; - using ANN::dim_; - std::vector event_; - std::vector> dev_ann_interface_; - AlgoProperty dev_ann_property_; - std::vector dev_id_; - std::vector d_data_; - std::vector dev_stream_; - std::vector mempool_; - std::vector comms_; - std::vector dev_data_offset_; - int dev_cnt_; - size_t nrow_; -}; - -template -MultiGpuANN::MultiGpuANN(Metric metric, - int dim, - const typename Algo::BuildParam& param, - const std::vector& dev_list) - : ANN(metric, dim), - dev_cnt_(dev_list.size()), - dev_ann_interface_(dev_list.size()), - dev_id_(dev_list), - d_data_(dev_list.size()), - 
dev_stream_(dev_list.size()), - event_(dev_list.size()), - mempool_(dev_list.size()), - comms_(dev_list.size()), - dev_data_offset_(dev_list.size()) -{ - DeviceRestorer restore_dev; - uint64_t threshold = UINT64_MAX; - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - ANN_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool_[i], dev_id_[i])); - ANN_CUDA_CHECK( - cudaMemPoolSetAttribute(mempool_[i], cudaMemPoolAttrReleaseThreshold, &threshold)); - std::vector desc; - for (int j = 0; j < dev_cnt_; j++) { - if (i == j) continue; - cudaMemAccessDesc tmp_desc; - tmp_desc.location.type = cudaMemLocationTypeDevice; - tmp_desc.location.id = dev_id_[j]; - tmp_desc.flags = cudaMemAccessFlagsProtReadWrite; - desc.push_back(tmp_desc); - } - ANN_CUDA_CHECK(cudaMemPoolSetAccess(mempool_[i], desc.data(), desc.size())); - ANN_CUDA_CHECK(cudaStreamCreate(&dev_stream_[i])); - ANN_CUDA_CHECK(cudaEventCreate(&event_[i], cudaEventDisableTiming)); - dev_ann_interface_[i] = std::make_unique(metric, dim, param); - } - NCCLCHECK(ncclCommInitAll(comms_.data(), dev_cnt_, dev_id_.data())); - - dev_ann_property_ = dev_ann_interface_[0]->get_property(); - if (dev_ann_property_.query_memory_type != MemoryType::Device) { - throw std::runtime_error("multigpu: query_memory_type of dev_algo must be DEVICE!"); - } -} - -template -MultiGpuANN::~MultiGpuANN() -{ - DeviceRestorer restore_dev; - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - if (d_data_[i] && dev_ann_property_.dataset_memory_type == MemoryType::Device) { - ANN_CUDA_CHECK(cudaFree(d_data_[i])); - } - ANN_CUDA_CHECK(cudaStreamDestroy(dev_stream_[i])); - ANN_CUDA_CHECK(cudaEventDestroy(event_[i])); - NCCLCHECK(ncclCommDestroy(comms_[i])); - } -} - -template -void MultiGpuANN::build(const T* dataset, size_t nrow, cudaStream_t stream) -{ - DeviceRestorer restore_dev; - distribute_dataset_(dataset, nrow); - nrow_ = nrow; - - std::vector threads; - - size_t basic_size = nrow / 
dev_cnt_; - size_t offset = 0; - int mod = nrow % dev_cnt_; - for (int i = 0; i < dev_cnt_; i++) { - size_t data_size = basic_size + (mod > i ? 1 : 0); - threads.emplace_back([&, i, data_size]() { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - dev_ann_interface_[i]->build(d_data_[i], data_size, dev_stream_[i]); - }); - dev_data_offset_[i] = offset; - offset += data_size; - } - for (auto& it : threads) - it.join(); - - set_wait_for_all_streams_(stream); -} - -template -void MultiGpuANN::set_search_param(const AnnSearchParam& param) -{ - DeviceRestorer restore_dev; - auto search_param = dynamic_cast(param); - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - dev_ann_interface_[i]->set_search_param(search_param); - } -} - -template -void MultiGpuANN::search(const T* queries, - int batch_size, - int k, - size_t* neighbors, - float* distances, - cudaStream_t stream) const -{ - DeviceRestorer restore_dev; - - std::vector d_queries(dev_cnt_); - std::vector d_neighbors(dev_cnt_); - std::vector d_distances(dev_cnt_); - - float* candidate_distances; - float* result_distances; - size_t* candidate_neighbors; - size_t* result_neighbors; - - int cur_dev; - ANN_CUDA_CHECK(cudaGetDevice(&cur_dev)); - - auto cur_dev_it = std::find(dev_id_.begin(), dev_id_.end(), cur_dev); - if (cur_dev_it == dev_id_.end()) { - throw std::runtime_error("current device is not in dev_list!"); - } - int cur_dev_id = cur_dev_it - dev_id_.begin(); - - size_t single_dev_result_size = static_cast(k) * batch_size; - size_t all_result_size = single_dev_result_size * dev_cnt_; - - ANN_CUDA_CHECK(cudaMallocAsync( - &candidate_distances, all_result_size * sizeof(float), dev_stream_[cur_dev_id])); - ANN_CUDA_CHECK(cudaMallocAsync( - &candidate_neighbors, all_result_size * sizeof(size_t), dev_stream_[cur_dev_id])); - ANN_CUDA_CHECK( - cudaMallocAsync(&result_distances, all_result_size * sizeof(float), dev_stream_[cur_dev_id])); - ANN_CUDA_CHECK( - 
cudaMallocAsync(&result_neighbors, all_result_size * sizeof(size_t), dev_stream_[cur_dev_id])); - - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - ANN_CUDA_CHECK(cudaMallocAsync(&d_queries[i], batch_size * dim_ * sizeof(T), dev_stream_[i])); - ANN_CUDA_CHECK( - cudaMallocAsync(&d_neighbors[i], single_dev_result_size * sizeof(size_t), dev_stream_[i])); - ANN_CUDA_CHECK( - cudaMallocAsync(&d_distances[i], single_dev_result_size * sizeof(float), dev_stream_[i])); - } - NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < dev_cnt_; i++) { - NCCLCHECK(ncclBroadcast(queries, - d_queries[i], - batch_size * dim_, - get_nccl_datatype(), - cur_dev_id, - comms_[i], - dev_stream_[i])); - } - NCCLCHECK(ncclGroupEnd()); - - std::vector threads; - - for (int i = 0; i < dev_cnt_; i++) { - threads.emplace_back([&, i]() { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - dev_ann_interface_[i]->search( - d_queries[i], batch_size, k, d_neighbors[i], d_distances[i], dev_stream_[i]); - add_index_offset_( - d_neighbors[i], single_dev_result_size, dev_data_offset_[i], dev_stream_[i]); - }); - } - - for (auto& it : threads) - it.join(); - - NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < dev_cnt_; i++) { - NCCLCHECK(ncclRecv(result_distances + i * single_dev_result_size, - single_dev_result_size, - get_nccl_datatype(), - i, - comms_[cur_dev_id], - dev_stream_[cur_dev_id])); - NCCLCHECK(ncclRecv(result_neighbors + i * single_dev_result_size, - single_dev_result_size, - get_nccl_datatype(), - i, - comms_[cur_dev_id], - dev_stream_[cur_dev_id])); - } - for (int i = 0; i < dev_cnt_; i++) { - NCCLCHECK(ncclSend(d_distances[i], - single_dev_result_size, - get_nccl_datatype(), - cur_dev_id, - comms_[i], - dev_stream_[i])); - NCCLCHECK(ncclSend(d_neighbors[i], - single_dev_result_size, - get_nccl_datatype(), - cur_dev_id, - comms_[i], - dev_stream_[i])); - } - NCCLCHECK(ncclGroupEnd()); - - set_wait_for_all_streams_(stream); - - 
ANN_CUDA_CHECK(cudaSetDevice(dev_id_[cur_dev_id])); - - reset_search_data_placement_( - candidate_distances, result_distances, k, batch_size, all_result_size, stream); - reset_search_data_placement_( - candidate_neighbors, result_neighbors, k, batch_size, all_result_size, stream); - - void* warp_sort_topk_buf = nullptr; - size_t buf_size = 0; - - nv::warp_sort_topk(nullptr, - buf_size, - candidate_distances, - candidate_neighbors, - batch_size, - k * dev_cnt_, - k, - distances, - neighbors, - false, - stream); - ANN_CUDA_CHECK(cudaMallocAsync(&warp_sort_topk_buf, buf_size, stream)); - nv::warp_sort_topk(warp_sort_topk_buf, - buf_size, - candidate_distances, - candidate_neighbors, - batch_size, - k * dev_cnt_, - k, - distances, - neighbors, - false, - stream); - - ANN_CUDA_CHECK(cudaFreeAsync(warp_sort_topk_buf, stream)); - ANN_CUDA_CHECK(cudaFreeAsync(candidate_neighbors, stream)); - ANN_CUDA_CHECK(cudaFreeAsync(candidate_distances, stream)); - ANN_CUDA_CHECK(cudaFreeAsync(result_neighbors, stream)); - ANN_CUDA_CHECK(cudaFreeAsync(result_distances, stream)); - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - ANN_CUDA_CHECK(cudaFreeAsync(d_queries[i], stream)); - ANN_CUDA_CHECK(cudaFreeAsync(d_neighbors[i], stream)); - ANN_CUDA_CHECK(cudaFreeAsync(d_distances[i], stream)); - } - ANN_CUDA_CHECK_LAST_ERROR() -} - -template -void MultiGpuANN::save(const std::string& file) const -{ - DeviceRestorer restore_dev; - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - dev_ann_interface_[i]->save(file + "_" + std::to_string(i)); - } - std::ofstream ofs(file); - if (!ofs) { throw std::runtime_error("can't open index file: " + file); } - ofs << nrow_ << '\n'; - for (auto it : dev_data_offset_) - ofs << it << '\n'; - ofs.close(); - if (!ofs) { throw std::runtime_error("can't write to index file: " + file); } -} - -template -void MultiGpuANN::load(const std::string& file) -{ - DeviceRestorer restore_dev; - for 
(int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - dev_ann_interface_[i]->load(file + "_" + std::to_string(i)); - } - std::ifstream ifs(file); - if (!ifs) { throw std::runtime_error("can't open index file: " + file); } - ifs >> nrow_; - for (auto& it : dev_data_offset_) - ifs >> it; - ifs.close(); - if (!ifs) { throw std::runtime_error("can't read from index file: " + file); } -} - -template -void MultiGpuANN::set_search_dataset(const T* dataset, size_t nrow) -{ - DeviceRestorer restore_dev; - distribute_dataset_(dataset, nrow); - size_t basic_size = nrow / dev_cnt_; - size_t offset = 0; - int mod = nrow % dev_cnt_; - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - size_t data_size = basic_size + (mod > i ? 1 : 0); - dev_ann_interface_[i]->set_search_dataset(d_data_[i], data_size); - offset += data_size; - } -} - -template -void MultiGpuANN::distribute_dataset_(const T* dataset, size_t nrow) -{ - size_t basic_size = nrow / dev_cnt_; - size_t offset = 0; - int mod = nrow % dev_cnt_; - for (int i = 0; i < dev_cnt_; i++) { - size_t data_size = (basic_size + (mod > i ? 
1 : 0)) * dim_; - if (dev_ann_property_.dataset_memory_type == MemoryType::Device) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - ANN_CUDA_CHECK(cudaMalloc(&d_data_[i], data_size * sizeof(T))); - ANN_CUDA_CHECK(cudaMemcpyAsync(d_data_[i], - dataset + offset, - data_size * sizeof(T), - cudaMemcpyHostToDevice, - dev_stream_[i])); - } else { - d_data_[i] = const_cast(dataset) + offset; - } - offset += data_size; - } -} - -template -void MultiGpuANN::add_index_offset_(size_t* arr, - size_t len, - size_t offset, - cudaStream_t stream) const -{ - add_index_offset_kernel<<<(len + block_size_ - 1) / block_size_, block_size_, 0, stream>>>( - arr, len, offset); -} - -template -void MultiGpuANN::set_wait_for_all_streams_(cudaStream_t stream) const -{ - for (int i = 0; i < dev_cnt_; i++) { - ANN_CUDA_CHECK(cudaSetDevice(dev_id_[i])); - ANN_CUDA_CHECK(cudaEventRecord(event_[i], dev_stream_[i])); - ANN_CUDA_CHECK(cudaStreamWaitEvent(stream, event_[i], 0)); - } -} - -template -template -void MultiGpuANN::reset_search_data_placement_( - U* arr, U* from, int k, int batch_size, size_t all_result_size, cudaStream_t stream) const -{ - reset_search_data_placement_kernel<<<(all_result_size + block_size_ - 1) / block_size_, - block_size_, - 0, - stream>>>( - arr, from, all_result_size, k, batch_size, dev_cnt_); -} - -} // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h similarity index 100% rename from cpp/bench/ann/src/raft_ann_bench_utils.h rename to cpp/bench/ann/src/raft/raft_ann_bench_utils.h diff --git a/cpp/bench/ann/src/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu similarity index 95% rename from cpp/bench/ann/src/raft_benchmark.cu rename to cpp/bench/ann/src/raft/raft_benchmark.cu index dc34ae2401..8ec375fc02 100644 --- a/cpp/bench/ann/src/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -13,7 +13,7 @@ * See the License for the specific language 
governing permissions and * limitations under the License. */ -#include "benchmark.hpp" +#include "../common/benchmark.hpp" #include "raft_benchmark.cuh" int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft_benchmark.cuh b/cpp/bench/ann/src/raft/raft_benchmark.cuh similarity index 99% rename from cpp/bench/ann/src/raft_benchmark.cuh rename to cpp/bench/ann/src/raft/raft_benchmark.cuh index 1eb195537b..eeb24e6a09 100644 --- a/cpp/bench/ann/src/raft_benchmark.cuh +++ b/cpp/bench/ann/src/raft/raft_benchmark.cuh @@ -22,7 +22,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" #undef WARP_SIZE #ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN #include "raft_wrapper.h" diff --git a/cpp/bench/ann/src/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu similarity index 100% rename from cpp/bench/ann/src/raft_ivf_flat.cu rename to cpp/bench/ann/src/raft/raft_ivf_flat.cu diff --git a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h similarity index 98% rename from cpp/bench/ann/src/raft_ivf_flat_wrapper.h rename to cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index c17303b5c1..1004b3d184 100644 --- a/cpp/bench/ann/src/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -35,8 +35,8 @@ #include #include -#include "ann.hpp" -#include "cudart_util.h" +#include "../common/ann.hpp" +#include "../common/cudart_util.h" #include "raft_ann_bench_utils.h" namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu similarity index 100% rename from cpp/bench/ann/src/raft_ivf_pq.cu rename to cpp/bench/ann/src/raft/raft_ivf_pq.cu diff --git a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h similarity index 99% rename from cpp/bench/ann/src/raft_ivf_pq_wrapper.h rename to cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h 
index f775b082d2..bbe446aed9 100644 --- a/cpp/bench/ann/src/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -33,8 +33,8 @@ #include #include -#include "ann.hpp" -#include "cudart_util.h" +#include "../common/ann.hpp" +#include "../common/cudart_util.h" #include "raft_ann_bench_utils.h" namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h similarity index 99% rename from cpp/bench/ann/src/raft_wrapper.h rename to cpp/bench/ann/src/raft/raft_wrapper.h index 29e88f9c93..5e538fdd23 100644 --- a/cpp/bench/ann/src/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -25,7 +25,7 @@ #include #include -#include "ann.hpp" +#include "../common/ann.hpp" namespace raft_temp { diff --git a/cpp/bench/ann/third_party/patches/ggnn.patch b/cpp/cmake/patches/ggnn.patch similarity index 100% rename from cpp/bench/ann/third_party/patches/ggnn.patch rename to cpp/cmake/patches/ggnn.patch diff --git a/cpp/bench/ann/third_party/patches/json.patch b/cpp/cmake/patches/nlohmann_json.patch similarity index 100% rename from cpp/bench/ann/third_party/patches/json.patch rename to cpp/cmake/patches/nlohmann_json.patch diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake index a448ae0078..708acb6b8d 100644 --- a/cpp/cmake/thirdparty/get_ggnn.cmake +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -19,12 +19,19 @@ function(find_and_configure_ggnn) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) - set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) - IF ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src) + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ ) + if (NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src/) + execute_process ( COMMAND git clone "https://github.com/${PKG_FORK}/ggnn" --branch ${PKG_PINNED_TAG} ggnn-src WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) - 
endif ( ) + + message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}") + execute_process ( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.patch + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src + ) + endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake new file mode 100644 index 0000000000..9334224de5 --- /dev/null +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_glog) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(glog ${PKG_VERSION} + GLOBAL_TARGETS glog::glog + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + ) + + if(glog_ADDED) + message(VERBOSE "RAFT: Using glog located in ${glog_SOURCE_DIR}") + else() + message(VERBOSE "RAFT: Using glog located in ${glog_DIR}") + endif() + + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_glog_SOURCE=/path/to/local/glog +find_and_configure_glog(VERSION 0.6.0 + FORK google + PINNED_TAG v0.6.0 + EXCLUDE_FROM_ALL ON + ) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 72c17a84c0..f092ff6428 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -19,8 +19,7 @@ function(find_and_configure_hnswlib) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) - set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) - + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) execute_process ( From 7451ab40b7eb8923c0beb9495dfb8fc2884241bc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 Mar 2023 21:21:20 -0500 Subject: [PATCH 12/39] Enabling all ann-benchmark variants --- cpp/bench/ann/CMakeLists.txt | 14 +++++++------- cpp/bench/ann/src/faiss/faiss_wrapper.h | 2 ++ cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 1 + cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 6 ++---- cpp/bench/ann/src/raft/raft_benchmark.cuh | 1 + 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 80eb66af7d..9950d99655 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,13 +15,13 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- -option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm 
in benchmark" ON) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) set(RAFT_ANN_BENCH_USE_FAISS OFF) diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h index 02070e05e7..0ef90c9fc6 100644 --- a/cpp/bench/ann/src/faiss/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -36,6 +36,8 @@ #include #include "../common/ann.hpp" +#include "../common/benchmark_util.hpp" +#include "../common/cudart_util.h" namespace { diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index 95f5763e36..87a4872125 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include "../common/benchmark.hpp" #include "ggnn_benchmark.cuh" diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index e6925a1dec..3d1d83aeb1 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef GGNN_WRAPPER_H_ -#define GGNN_WRAPPER_H_ + +#pragma once #include #include @@ -306,5 +306,3 @@ void GgnnImpl::load(const std::string& file) } } // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cuh b/cpp/bench/ann/src/raft/raft_benchmark.cuh index eeb24e6a09..4bee91db7b 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cuh +++ b/cpp/bench/ann/src/raft/raft_benchmark.cuh @@ -23,6 +23,7 @@ #include #include "../common/ann.hpp" +#include "../common/benchmark_util.hpp" #undef WARP_SIZE #ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN #include "raft_wrapper.h" From 7b9bc9556d2b2bacc05625e0488049f52609ef9f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 6 Mar 2023 21:43:55 -0500 Subject: [PATCH 13/39] Removing more headers. Consolidating ann bench cudart_util into RAFT's cudart_utils.hpp --- cpp/bench/ann/src/common/benchmark.hpp | 50 ++-- cpp/bench/ann/src/common/cudart_util.h | 64 ------ cpp/bench/ann/src/common/dataset.h | 14 +- cpp/bench/ann/src/faiss/faiss_benchmark.cu | 132 ++++++++++- cpp/bench/ann/src/faiss/faiss_benchmark.cuh | 148 ------------ cpp/bench/ann/src/faiss/faiss_wrapper.h | 4 +- cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 108 ++++++++- cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh | 123 ---------- cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 10 +- .../ann/src/hnswlib/hnswlib_benchmark.cpp | 102 ++++++++- .../ann/src/hnswlib/hnswlib_benchmark.hpp | 118 ---------- cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 6 +- cpp/bench/ann/src/raft/raft_benchmark.cu | 199 +++++++++++++++- cpp/bench/ann/src/raft/raft_benchmark.cuh | 213 ------------------ .../ann/src/raft/raft_ivf_flat_wrapper.h | 8 +- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 9 +- cpp/bench/ann/src/raft/raft_wrapper.h | 7 +- cpp/include/raft/util/cudart_utils.hpp | 28 +-- 18 files changed, 587 insertions(+), 756 deletions(-) delete mode 100644 cpp/bench/ann/src/common/cudart_util.h delete mode 100644 cpp/bench/ann/src/faiss/faiss_benchmark.cuh 
delete mode 100644 cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh delete mode 100644 cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp delete mode 100644 cpp/bench/ann/src/raft/raft_benchmark.cuh diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 7ba40512a7..6eb75976f3 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -171,7 +171,7 @@ template void build(const Dataset* dataset, const std::vector& indices) { cudaStream_t stream; - ANN_CUDA_CHECK(cudaStreamCreate(&stream)); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); log_info( "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); @@ -199,19 +199,20 @@ void build(const Dataset* dataset, const std::vector& i } log_info("building index '%s'", index.name.c_str()); - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); #ifdef NVTX nvtxRangePush("build"); #endif Timer timer; algo->build(base_set_ptr, dataset->base_set_size(), stream); - ANN_CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); float elapsed_ms = timer.elapsed_ms(); #ifdef NVTX nvtxRangePop(); #endif log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); - ANN_CUDA_CHECK_LAST_ERROR(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); algo->save(index.file); write_build_info(index.file, @@ -224,7 +225,7 @@ void build(const Dataset* dataset, const std::vector& i log_info("saved index to %s", index.file.c_str()); } - ANN_CUDA_CHECK(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } inline void write_search_result(const std::string& file_prefix, @@ -277,7 +278,7 @@ inline void search(const Dataset* dataset, const std::vectorname().c_str(), @@ -307,8 +308,8 @@ inline void search(const Dataset* dataset, const std::vector* dataset, const std::vector* dataset, const std::vector* dataset, const 
std::vector* dataset, const std::vector* dataset, const std::vector -#include - -#include - -#define ANN_CUDA_CHECK(call) \ - { \ - raft::bench::ann::cuda_check_((call), __FILE__, __LINE__); \ - } - -#ifndef NDEBUG -#define ANN_CUDA_CHECK_LAST_ERROR() \ - { \ - raft::bench::ann::cuda_check_last_error_(__FILE__, __LINE__); \ - } -#else -#define ANN_CUDA_CHECK_LAST_ERROR() -#endif - -namespace raft::bench::ann { - -constexpr unsigned int WARP_FULL_MASK = 0xffffffff; -constexpr int WARP_SIZE = 32; - -class CudaException : public std::runtime_error { - public: - explicit CudaException(const std::string& what) : runtime_error(what) {} -}; - -inline void cuda_check_(cudaError_t val, const char* file, int line) -{ - if (val != cudaSuccess) { - throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + - std::to_string(val) + ": " + cudaGetErrorName(val) + ": " + - cudaGetErrorString(val)); - } -} - -inline void cuda_check_last_error_(const char* file, int line) -{ - cudaDeviceSynchronize(); - cudaError_t err = cudaPeekAtLastError(); - cuda_check_(err, file, line); -} - -} // namespace raft::bench::ann -#endif diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h index e427a2c9dd..4c19e6be8c 100644 --- a/cpp/bench/ann/src/common/dataset.h +++ b/cpp/bench/ann/src/common/dataset.h @@ -29,7 +29,7 @@ #include #include -#include "cudart_util.h" +#include namespace raft::bench::ann { @@ -272,8 +272,8 @@ Dataset::~Dataset() { delete[] base_set_; delete[] query_set_; - if (d_base_set_) { ANN_CUDA_CHECK(cudaFree(d_base_set_)); } - if (d_query_set_) { ANN_CUDA_CHECK(cudaFree(d_query_set_)); } + if (d_base_set_) { RAFT_CUDA_TRY(cudaFree(d_base_set_)); } + if (d_query_set_) { RAFT_CUDA_TRY(cudaFree(d_query_set_)); } } template @@ -281,8 +281,8 @@ const T* Dataset::base_set_on_gpu() const { if (!d_base_set_) { base_set(); - ANN_CUDA_CHECK(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); - 
ANN_CUDA_CHECK(cudaMemcpy( + RAFT_CUDA_TRY(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( d_base_set_, base_set_, base_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); } return d_base_set_; @@ -293,8 +293,8 @@ const T* Dataset::query_set_on_gpu() const { if (!d_query_set_) { query_set(); - ANN_CUDA_CHECK(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); - ANN_CUDA_CHECK(cudaMemcpy( + RAFT_CUDA_TRY(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( d_query_set_, query_set_, query_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); } return d_query_set_; diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu index 1730bd0683..8e3eaab6ab 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -14,6 +14,136 @@ * limitations under the License. */ #include "../common/benchmark.hpp" -#include "faiss_benchmark.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann.hpp" +#undef WARP_SIZE +#include "faiss_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) +{ + param.nlist = conf.at("nlist"); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename 
raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") 
{ + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cuh b/cpp/bench/ann/src/faiss/faiss_benchmark.cuh deleted file mode 100644 index 01a0cadeaa..0000000000 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cuh +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "../common/ann.hpp" -#undef WARP_SIZE -#include "faiss_wrapper.h" -#define JSON_DIAGNOSTICS 1 -#include - -namespace raft::bench::ann { - -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) -{ - param.nlist = conf.at("nlist"); -} - -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) -{ - param.nlist = conf.at("nlist"); - param.M = conf.at("M"); - if (conf.contains("usePrecomputed")) { - param.usePrecomputed = conf.at("usePrecomputed"); - } else { - param.usePrecomputed = false; - } - if (conf.contains("useFloat16")) { - param.useFloat16 = conf.at("useFloat16"); - } else { - param.useFloat16 = false; - } -} - -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) -{ - param.nlist = conf.at("nlist"); - param.quantizer_type = conf.at("quantizer_type"); -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::FaissGpu::SearchParam& param) -{ - param.nprobe = conf.at("nprobe"); -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, dim, param); -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - - (void)dev_list; - return std::make_unique>(metric, dim, param); -} - -template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - 
// stop compiler warning; not all algorithms support multi-GPU so it may not be used - (void)dev_list; - - raft::bench::ann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; - - if constexpr (std::is_same_v) { - if (algo == "faiss_gpu_ivf_flat") { - ann = make_algo(metric, dim, conf, dev_list); - } else if (algo == "faiss_gpu_ivf_pq") { - ann = make_algo(metric, dim, conf); - } else if (algo == "faiss_gpu_ivf_sq") { - ann = make_algo(metric, dim, conf); - } else if (algo == "faiss_gpu_flat") { - ann = std::make_unique>(metric, dim); - } - } - - if constexpr (std::is_same_v) {} - - if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } - - if (refine_ratio > 1.0) {} - return ann; -} - -template -std::unique_ptr::AnnSearchParam> create_search_param( - const std::string& algo, const nlohmann::json& conf) -{ - if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } else if (algo == "faiss_gpu_flat") { - auto param = std::make_unique::AnnSearchParam>(); - return param; - } - // else - throw std::runtime_error("invalid algo: '" + algo + "'"); -} - -} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h index 0ef90c9fc6..6c2b3d6cd0 100644 --- a/cpp/bench/ann/src/faiss/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -37,7 +37,7 @@ #include "../common/ann.hpp" #include "../common/benchmark_util.hpp" -#include "../common/cudart_util.h" +#include namespace { @@ -127,7 +127,7 @@ FaissGpu::FaissGpu(Metric metric, int dim, int nlist) : ANN(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist) { static_assert(std::is_same_v, "faiss support only float type"); - ANN_CUDA_CHECK(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); } template diff --git 
a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index 87a4872125..ab9b815305 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -15,6 +15,112 @@ */ #include "../common/benchmark.hpp" -#include "ggnn_benchmark.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann.hpp" +#undef WARP_SIZE +#include "ggnn_wrapper.cuh" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::BuildParam& param) +{ + param.dataset_size = conf.at("dataset_size"); + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& 
conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) {} + + if constexpr (std::is_same_v) {} + + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh deleted file mode 100644 index a5b6957c0c..0000000000 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "../common/ann.hpp" -#undef WARP_SIZE -#include "ggnn_wrapper.cuh" -#define JSON_DIAGNOSTICS 1 -#include - -namespace raft::bench::ann { - -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::Ggnn::BuildParam& param) -{ - param.dataset_size = conf.at("dataset_size"); - param.k = conf.at("k"); - - if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } - if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } - if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } - if (conf.contains("tau")) { param.tau = conf.at("tau"); } - if (conf.contains("refine_iterations")) { - param.refine_iterations = conf.at("refine_iterations"); - } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::Ggnn::SearchParam& param) -{ - param.tau = conf.at("tau"); - - if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } - if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } - if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } - if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, 
dim, param); -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - - (void)dev_list; - return std::make_unique>(metric, dim, param); -} - -template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - // stop compiler warning; not all algorithms support multi-GPU so it may not be used - (void)dev_list; - - raft::bench::ann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; - - if constexpr (std::is_same_v) {} - - if constexpr (std::is_same_v) {} - - if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } - if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } - - if (refine_ratio > 1.0) {} - return ann; -} - -template -std::unique_ptr::AnnSearchParam> create_search_param( - const std::string& algo, const nlohmann::json& conf) -{ - if (algo == "ggnn") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } - // else - throw std::runtime_error("invalid algo: '" + algo + "'"); -} - -} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index 3d1d83aeb1..de497ead02 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -21,8 +21,8 @@ #include "../common/ann.hpp" #include "../common/benchmark_util.hpp" -#include "../common/cudart_util.h" #include +#include namespace raft::bench::ann { @@ -184,7 +184,7 @@ GgnnImpl::GgnnImpl(Metric metric, if (dim != D) { throw std::runtime_error("mis-matched dim"); } int device; - ANN_CUDA_CHECK(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaGetDevice(&device)); ggnn_ = std::make_unique( device, build_param_.dataset_size, 
build_param_.num_layers, true, build_param_.tau); @@ -242,7 +242,7 @@ void GgnnImpl::search(const T* queries, } ggnn_->set_stream(stream); - ANN_CUDA_CHECK(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); + RAFT_CUDA_TRY(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); const int block_dim = search_param_.block_dim; const int max_iterations = search_param_.max_iterations; @@ -289,7 +289,7 @@ void GgnnImpl::save(const std::string& file) c ggnn_->set_stream(0); ggnn_host.downloadAsync(ggnn_device); - ANN_CUDA_CHECK(cudaStreamSynchronize(ggnn_device.stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); ggnn_host.store(file); } @@ -302,7 +302,7 @@ void GgnnImpl::load(const std::string& file) ggnn_host.load(file); ggnn_host.uploadAsync(ggnn_device); - ANN_CUDA_CHECK(cudaStreamSynchronize(ggnn_device.stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 75192d68a5..8832f31567 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -13,7 +13,107 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "hnswlib_benchmark.hpp" #include "../common/benchmark.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "../common/benchmark_util.hpp" + +#include "../common/ann.hpp" +#undef WARP_SIZE +#include "hnswlib_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, 
conf); } + } + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp deleted file mode 100644 index 6d3851799e..0000000000 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.hpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "../common/benchmark_util.hpp" - -#include "../common/ann.hpp" -#undef WARP_SIZE -#include "hnswlib_wrapper.h" -#define JSON_DIAGNOSTICS 1 -#include - -namespace raft::bench::ann { - -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::HnswLib::BuildParam& param) -{ - param.ef_construction = conf.at("efConstruction"); - param.M = conf.at("M"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::HnswLib::SearchParam& param) -{ - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, dim, param); -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - - (void)dev_list; - return std::make_unique>(metric, dim, param); -} - -template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - // stop compiler warning; not all algorithms support multi-GPU so it may not be used - (void)dev_list; - - raft::bench::ann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; - - if constexpr (std::is_same_v) { - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } - } - - if constexpr (std::is_same_v) { - if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } - } - - if (!ann) { throw 
std::runtime_error("invalid algo: '" + algo + "'"); } - - if (refine_ratio > 1.0) {} - return ann; -} - -template -std::unique_ptr::AnnSearchParam> create_search_param( - const std::string& algo, const nlohmann::json& conf) -{ - if (algo == "hnswlib") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } - // else - throw std::runtime_error("invalid algo: '" + algo + "'"); -} - -}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index fa8cc4b824..c961683aa6 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -13,9 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef RAFT_CUANN_UTILS_H_ -#define RAFT_CUANN_UTILS_H_ - +#pragma once #include #include #include @@ -45,5 +43,3 @@ inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric m } } } // namespace raft::bench::ann - -#endif \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index 8ec375fc02..ddb428ae53 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -14,6 +14,203 @@ * limitations under the License. 
*/ #include "../common/benchmark.hpp" -#include "raft_benchmark.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann.hpp" +#include "../common/benchmark_util.hpp" +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +#include "raft_ivf_pq_wrapper.h" +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); + std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } +} + +template +void parse_search_param(const 
nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + param.pq_param.n_probes = conf.at("numProbes"); + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } +} +#endif + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + 
raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = + std::make_unique>(metric, dim, param, refine_ratio); + } +#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cuh b/cpp/bench/ann/src/raft/raft_benchmark.cuh deleted file mode 100644 index 4bee91db7b..0000000000 --- a/cpp/bench/ann/src/raft/raft_benchmark.cuh +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA 
CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include "../common/ann.hpp" -#include "../common/benchmark_util.hpp" -#undef WARP_SIZE -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN -#include "raft_wrapper.h" -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -#include "raft_ivf_flat_wrapper.h" -extern template class raft::bench::ann::RaftIvfFlatGpu; -extern template class raft::bench::ann::RaftIvfFlatGpu; -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ -#include "raft_ivf_pq_wrapper.h" -extern template class raft::bench::ann::RaftIvfPQ; -extern template class raft::bench::ann::RaftIvfPQ; -#endif -#define JSON_DIAGNOSTICS 1 -#include - -namespace raft::bench::ann { - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { - param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); - std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; - } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) -{ - param.ivf_flat_params.n_probes = conf.at("nprobe"); -} -#endif - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ 
-template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfPQ::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } - if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } - if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfPQ::SearchParam& param) -{ - param.pq_param.n_probes = conf.at("numProbes"); - if (conf.contains("internalDistanceDtype")) { - std::string type = conf.at("internalDistanceDtype"); - if (type == "float") { - param.pq_param.internal_distance_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } else { - throw std::runtime_error("internalDistanceDtype: '" + type + - "', should be either 'float' or 'half'"); - } - } else { - // set half as default type - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } - - if (conf.contains("smemLutDtype")) { - std::string type = conf.at("smemLutDtype"); - if (type == "float") { - param.pq_param.lut_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.lut_dtype = CUDA_R_16F; - } else if (type == "fp8") { - param.pq_param.lut_dtype = CUDA_R_8U; - } else { - throw std::runtime_error("smemLutDtype: '" + type + - "', should be either 'float', 'half' or 'fp8'"); - } - } else { - // set half as default - param.pq_param.lut_dtype = CUDA_R_16F; - } -} -#endif - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, dim, param); -} - -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, 
- const nlohmann::json& conf, - const std::vector& dev_list) -{ - typename Algo::BuildParam param; - parse_build_param(conf, param); - - (void)dev_list; - return std::make_unique>(metric, dim, param); -} - -template -std::unique_ptr> create_algo(const std::string& algo, - const std::string& distance, - int dim, - float refine_ratio, - const nlohmann::json& conf, - const std::vector& dev_list) -{ - // stop compiler warning; not all algorithms support multi-GPU so it may not be used - (void)dev_list; - - raft::bench::ann::Metric metric = parse_metric(distance); - std::unique_ptr> ann; - - if constexpr (std::is_same_v) { -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } -#endif - } - - if constexpr (std::is_same_v) {} - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT - if (algo == "raft_ivf_flat") { - typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; - parse_build_param(conf, param); - ann = std::make_unique>(metric, dim, param); - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ - if (algo == "raft_ivf_pq") { - typename raft::bench::ann::RaftIvfPQ::BuildParam param; - parse_build_param(conf, param); - ann = - std::make_unique>(metric, dim, param, refine_ratio); - } -#endif - if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } - - if (refine_ratio > 1.0) {} - return ann; -} - -template -std::unique_ptr::AnnSearchParam> create_search_param( - const std::string& algo, const nlohmann::json& conf) -{ -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { - auto param = std::make_unique::AnnSearchParam>(); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT - if (algo == "raft_ivf_flat") { - auto param = - std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return param; - } -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ - if (algo == "raft_ivf_pq") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); - return 
param; - } -#endif - // else - throw std::runtime_error("invalid algo: '" + algo + "'"); -} - -} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 1004b3d184..e579f31dfe 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef RAFT_IVF_FLAT_WRAPPER_H_ -#define RAFT_IVF_FLAT_WRAPPER_H_ +#pragma once #include #include @@ -36,8 +35,8 @@ #include #include "../common/ann.hpp" -#include "../common/cudart_util.h" #include "raft_ann_bench_utils.h" +#include namespace raft::bench::ann { @@ -98,7 +97,7 @@ RaftIvfFlatGpu::RaftIvfFlatGpu(Metric metric, int dim, const BuildParam mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) { index_params_.metric = parse_metric_type(metric); - ANN_CUDA_CHECK(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); } template @@ -143,4 +142,3 @@ void RaftIvfFlatGpu::search( return; } } // namespace raft::bench::ann -#endif diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index bbe446aed9..89e8cc73b1 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef RAFT_IVF_PQ_WRAPPER_H_ -#define RAFT_IVF_PQ_WRAPPER_H_ +#pragma once #include #include @@ -34,8 +33,8 @@ #include #include "../common/ann.hpp" -#include "../common/cudart_util.h" #include "raft_ann_bench_utils.h" +#include namespace raft::bench::ann { @@ -99,7 +98,7 @@ RaftIvfPQ::RaftIvfPQ(Metric metric, int dim, const BuildParam& param, f mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) { index_params_.metric = parse_metric_type(metric); - ANN_CUDA_CHECK(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); } template @@ -221,5 +220,3 @@ void RaftIvfPQ::search(const T* queries, return; } } // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index 5e538fdd23..a5a854f2a6 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef RAFT_WRAPPER_H_ -#define RAFT_WRAPPER_H_ +#pragma once #include #include @@ -91,7 +90,7 @@ RaftGpu::RaftGpu(Metric metric, int dim) { static_assert(std::is_same_v, "raft support only float type"); assert(metric_type_ == raft::distance::DistanceType::L2Expanded); - ANN_CUDA_CHECK(cudaGetDevice(&device_)); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); } template @@ -151,5 +150,3 @@ void RaftGpu::search(const T* queries, } } // namespace raft::bench::ann - -#endif diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index 0feb188ad8..d40884edc9 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -77,11 +77,6 @@ struct cuda_error : public raft::exception { } \ } while (0) -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - /** * @brief Debug macro to check for CUDA errors * @@ -101,16 +96,6 @@ struct cuda_error : public raft::exception { #define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); #endif -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - // /** // * @brief check for cuda runtime API errors but log error instead of raising // * exception. @@ -127,17 +112,6 @@ struct cuda_error : public raft::exception { } \ } while (0) -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. 
- * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - namespace raft { /** Helper method to get to know warp size in device code */ @@ -249,7 +223,7 @@ class grid_1d_block_t { template void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** From 59438fa201e811571db8631d2ab02290967c5974 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 6 Mar 2023 22:03:21 -0500 Subject: [PATCH 14/39] More cleanup --- cpp/CMakeLists.txt | 1 - cpp/bench/ann/src/common/conf.h | 6 +----- cpp/bench/ann/src/common/dataset.h | 10 +++------- cpp/bench/ann/src/common/util.h | 5 +---- cpp/bench/ann/src/faiss/faiss_benchmark.cu | 3 ++- cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 4 ++-- cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp | 3 ++- cpp/bench/ann/src/raft/raft_benchmark.cu | 5 +++-- cpp/include/raft/util/cudart_utils.hpp | 12 +----------- 9 files changed, 15 insertions(+), 34 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bc7fd3ea63..8127d0fdf5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -745,6 +745,5 @@ endif() # * build ann benchmark executable ----------------------------------------------- if(BUILD_ANN_BENCH) - message("GOT HERE!") include(bench/ann/CMakeLists.txt) endif() diff --git a/cpp/bench/ann/src/common/conf.h b/cpp/bench/ann/src/common/conf.h index cdf8968d8d..c498a93ca1 100644 --- a/cpp/bench/ann/src/common/conf.h +++ b/cpp/bench/ann/src/common/conf.h @@ -13,9 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef CONF_H_ -#define CONF_H_ - +#pragma once #include #include #include @@ -72,5 +70,3 @@ class Configuration { }; } // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h index 4c19e6be8c..1244935c99 100644 --- a/cpp/bench/ann/src/common/dataset.h +++ b/cpp/bench/ann/src/common/dataset.h @@ -13,9 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef DATASET_H_ -#define DATASET_H_ - +#pragma once #include #include #include @@ -272,8 +270,8 @@ Dataset::~Dataset() { delete[] base_set_; delete[] query_set_; - if (d_base_set_) { RAFT_CUDA_TRY(cudaFree(d_base_set_)); } - if (d_query_set_) { RAFT_CUDA_TRY(cudaFree(d_query_set_)); } + if (d_base_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_base_set_)); } + if (d_query_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_query_set_)); } } template @@ -381,5 +379,3 @@ void BinDataset::map_base_set_() const } } // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/common/util.h b/cpp/bench/ann/src/common/util.h index 0599dc3bf2..290bf4cea9 100644 --- a/cpp/bench/ann/src/common/util.h +++ b/cpp/bench/ann/src/common/util.h @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef UTIL_H_ -#define UTIL_H_ +#pragma once #include #include @@ -78,5 +77,3 @@ void log_error(Ts... vs) } } // namespace raft::bench::ann - -#endif diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu index 8e3eaab6ab..188dbe1d25 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "../common/benchmark.hpp" #include #include @@ -146,4 +145,6 @@ std::unique_ptr::AnnSearchParam> create_search } // namespace raft::bench::ann +#include "../common/benchmark.hpp" + int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index ab9b815305..f7c275a1d8 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include "../common/benchmark.hpp" - #include #include #include @@ -123,4 +121,6 @@ std::unique_ptr::AnnSearchParam> create_search } // namespace raft::bench::ann +#include "../common/benchmark.hpp" + int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 8832f31567..fb056f395f 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "../common/benchmark.hpp" #include #include @@ -116,4 +115,6 @@ std::unique_ptr::AnnSearchParam> create_search }; // namespace raft::bench::ann +#include "../common/benchmark.hpp" + int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index ddb428ae53..e2287bce35 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "../common/benchmark.hpp" #include #include @@ -211,6 +210,8 @@ std::unique_ptr::AnnSearchParam> create_search throw std::runtime_error("invalid algo: '" + algo + "'"); } -} // namespace raft::bench::ann +}; // namespace raft::bench::ann + +#include "../common/benchmark.hpp" int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index d40884edc9..2854186506 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -14,14 +14,6 @@ * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. - */ - -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once #include @@ -31,7 +23,7 @@ #include #include -#include +#include #include #include @@ -540,5 +532,3 @@ inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_ } } // namespace raft - -#endif From 52dde57fda170fbc0508705122e04341a56397cd Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 Mar 2023 22:38:34 -0500 Subject: [PATCH 15/39] More cleanup --- cpp/bench/ann/CMakeLists.txt | 1 - cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 9950d99655..950d83fe13 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -41,7 +41,6 @@ if(RAFT_ANN_BENCH_USE_RAFT_BFKNN endif() if(RAFT_ANN_BENCH_USE_HNSWLIB) - message("Using hnswlib") include(cmake/thirdparty/get_hnswlib.cmake) endif() diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index f7c275a1d8..5d74bf059d 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -23,7 +23,6 @@ #include #include "../common/ann.hpp" -#undef WARP_SIZE #include "ggnn_wrapper.cuh" #define JSON_DIAGNOSTICS 1 #include From fcb2e734438a646004191a8ef8e6cc44ab0faf4a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 Mar 2023 14:36:54 -0500 Subject: [PATCH 16/39] Fixing a few more stray macros --- cpp/include/raft/spectral/detail/matrix_wrappers.hpp | 2 +- cpp/include/raft/util/cudart_utils.hpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp index e32b718117..73518e20ef 100644 --- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp @@ -352,7 +352,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // scales y by beta: // if (beta == 0) { - CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { // TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); diff --git a/cpp/include/raft/util/cudart_utils.hpp 
b/cpp/include/raft/util/cudart_utils.hpp index 2854186506..252ca8b3ff 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -241,7 +241,8 @@ void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_vi template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -270,7 +271,7 @@ void print_device_vector(const char* variable_name, OutStream& out) { auto host_mem = std::make_unique(componentsCount); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpy(host_mem.get(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem.get(), componentsCount, out); } From 5c734bd2d9a74b2215b8f4144488f621c41a0207 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 Mar 2023 17:07:11 -0500 Subject: [PATCH 17/39] Renaming cuann --- dependencies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index cfabfa4712..e4971c21db 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -22,7 +22,7 @@ files: - cudatoolkit - py_version - test_python - cuann_bench: + bench_ann: output: none includes: - cudatoolkit From d6647bc07232223ec3baf5d1be8d2730f2f41503 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 9 Mar 2023 15:04:38 -0500 Subject: [PATCH 18/39] Doing some renaming --- cpp/bench/ann/src/common/{ann.hpp => ann_types.hpp} | 0 cpp/bench/ann/src/common/benchmark_util.hpp | 2 +- cpp/bench/ann/src/faiss/faiss_benchmark.cu | 2 +- cpp/bench/ann/src/faiss/faiss_wrapper.h | 2 +- cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 2 +- cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 2 +- cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp | 2 +- cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 2 +- cpp/bench/ann/src/raft/raft_benchmark.cu | 2 +- cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h | 2 +- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 2 +- cpp/bench/ann/src/raft/raft_wrapper.h | 2 +- 12 files changed, 11 insertions(+), 11 deletions(-) rename cpp/bench/ann/src/common/{ann.hpp => ann_types.hpp} (100%) diff --git a/cpp/bench/ann/src/common/ann.hpp b/cpp/bench/ann/src/common/ann_types.hpp similarity index 100% rename from cpp/bench/ann/src/common/ann.hpp rename to cpp/bench/ann/src/common/ann_types.hpp diff --git a/cpp/bench/ann/src/common/benchmark_util.hpp b/cpp/bench/ann/src/common/benchmark_util.hpp index 52bd193042..7005883ffc 100644 --- a/cpp/bench/ann/src/common/benchmark_util.hpp +++ b/cpp/bench/ann/src/common/benchmark_util.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include "ann.hpp" +#include "ann_types.hpp" #include namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu index 188dbe1d25..294da9a14f 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -22,7 +22,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #undef WARP_SIZE #include "faiss_wrapper.h" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h index 6c2b3d6cd0..8cfc26ea5b 100644 --- a/cpp/bench/ann/src/faiss/faiss_wrapper.h +++ 
b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -35,7 +35,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "../common/benchmark_util.hpp" #include diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index 5d74bf059d..8072cd857c 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -22,7 +22,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "ggnn_wrapper.cuh" #define JSON_DIAGNOSTICS 1 #include diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index de497ead02..fd8fe0f2ec 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -19,7 +19,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "../common/benchmark_util.hpp" #include #include diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index fb056f395f..cd823e8a69 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -24,7 +24,7 @@ #include "../common/benchmark_util.hpp" -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #undef WARP_SIZE #include "hnswlib_wrapper.h" #define JSON_DIAGNOSTICS 1 diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 379a0dc482..c5c3a4a2a6 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -31,7 +31,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include namespace raft::bench::ann { diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index e2287bce35..b28c502a2c 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ 
b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -22,7 +22,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "../common/benchmark_util.hpp" #undef WARP_SIZE #ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index e579f31dfe..bf6d37ed59 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -34,7 +34,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "raft_ann_bench_utils.h" #include diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 89e8cc73b1..5a0c4dd2b5 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -32,7 +32,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" #include "raft_ann_bench_utils.h" #include diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index a5a854f2a6..377bb925b7 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -24,7 +24,7 @@ #include #include -#include "../common/ann.hpp" +#include "../common/ann_types.hpp" namespace raft_temp { From e8919244f45e36aafaba4b8fa52d97140a0bbcb9 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 16 Mar 2023 06:07:40 -0400 Subject: [PATCH 19/39] Update cpp/bench/ann/conf/deep-100M.json Co-authored-by: Tamas Bela Feher --- cpp/bench/ann/conf/deep-100M.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json index 36b42aba92..ade590d35c 100644 --- a/cpp/bench/ann/conf/deep-100M.json +++ b/cpp/bench/ann/conf/deep-100M.json @@ -5,7 +5,7 @@ "subset_size" : 100000000, "query_file" : "data/deep-1B/query.public.10K.fbin", // although distance should be "euclidean", faiss becomes much slower for that - "distance" : "inner_product" + "distance" : "euclidean" }, "search_basic_param" : { From 122851983e78dc2d7d473a947172ddc64235aa4f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 23 Mar 2023 16:52:26 -0400 Subject: [PATCH 20/39] Updates based on review feedback --- build.sh | 8 +- cpp/bench/ann/CMakeLists.txt | 31 +++--- cpp/bench/ann/scripts/hdf5_to_fbin.py | 97 ++++++++++--------- cpp/bench/ann/src/common/benchmark.hpp | 91 +++++++++++------ cpp/bench/ann/src/common/conf.cpp | 15 +++ cpp/bench/ann/src/common/conf.h | 3 + cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 3 +- cpp/bench/ann/src/raft/raft_benchmark.cu | 34 ++++--- cpp/bench/ann/src/raft/raft_ivf_flat.cu | 9 +- .../ann/src/raft/raft_ivf_flat_wrapper.h | 7 +- cpp/bench/ann/src/raft/raft_ivf_pq.cu | 9 +- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 53 +++++----- cpp/bench/ann/src/raft/raft_wrapper.h | 2 +- cpp/bench/prims/CMakeLists.txt | 2 + .../distance/tune_pairwise/bench.cu | 0 .../distance/tune_pairwise/kernel.cu | 0 .../distance/tune_pairwise/kernel.cuh | 0 cpp/cmake/thirdparty/get_faiss.cmake | 87 +++++++++++++++++ cpp/cmake/thirdparty/get_nlohmann_json.cmake | 4 +- dependencies.yaml | 3 + 20 files changed, 299 insertions(+), 159 deletions(-) rename cpp/bench/{ => prims}/distance/tune_pairwise/bench.cu (100%) rename cpp/bench/{ => prims}/distance/tune_pairwise/kernel.cu (100%) rename 
cpp/bench/{ => prims}/distance/tune_pairwise/kernel.cuh (100%) create mode 100644 cpp/cmake/thirdparty/get_faiss.cmake diff --git a/build.sh b/build.sh index d08bc87e2e..b6a6567cdc 100755 --- a/build.sh +++ b/build.sh @@ -315,13 +315,7 @@ fi if hasArg bench-ann || (( ${NUMARGS} == 0 )); then BUILD_ANN_BENCH=ON CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}" - - # Force compile nn library when needed benchmark targets are specified - if [[ $CMAKE_TARGET == *"_RAFT_"* ]]; then - ENABLE_NN_DEPENDENCIES=ON - COMPILE_DIST_LIBRARY=ON - COMPILE_NN_LIBRARY=ON - fi + COMPILE_LIBRARY=ON fi if hasArg --no-nvtx; then diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 950d83fe13..f3fbb84646 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -56,19 +56,19 @@ if(RAFT_ANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss.cmake) endif() -function(ConfigureCuannBench) +function(ConfigureAnnBench) set(oneValueArgs NAME) set(multiValueArgs PATH LINKS CXXFLAGS INCLUDES) cmake_parse_arguments( - ConfigureCuannBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ConfigureAnnBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) - set(BENCH_NAME ${ConfigureCuannBench_NAME}_ANN_BENCH) + set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH) add_executable( - ${BENCH_NAME} ${ConfigureCuannBench_PATH} bench/ann/src/common/conf.cpp + ${BENCH_NAME} ${ConfigureAnnBench_PATH} bench/ann/src/common/conf.cpp bench/ann/src/common/util.cpp ) target_link_libraries( @@ -76,7 +76,7 @@ function(ConfigureCuannBench) PRIVATE raft::raft nlohmann_json::nlohmann_json $<$:NCCL::NCCL> - ${ConfigureCuannBench_LINKS} + ${ConfigureAnnBench_LINKS} Threads::Threads $ $ @@ -94,25 +94,25 @@ function(ConfigureCuannBench) INTERFACE_POSITION_INDEPENDENT_CODE ON ) - set(${ConfigureCuannBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureCuannBench_CXXFLAGS}) + set(${ConfigureAnnBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS}) 
target_compile_options( - ${BENCH_NAME} PRIVATE "$<$:${ConfigureCuannBench_CXXFLAGS}>" + ${BENCH_NAME} PRIVATE "$<$:${ConfigureAnnBench_CXXFLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" ) - if(RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME}) + if(RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}) target_compile_definitions( ${BENCH_NAME} PUBLIC - RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureCuannBench_NAME} + RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME} ) endif() target_include_directories( ${BENCH_NAME} PUBLIC "$" - PRIVATE ${ConfigureCuannBench_INCLUDES} + PRIVATE ${ConfigureAnnBench_INCLUDES} ) install( @@ -124,14 +124,14 @@ function(ConfigureCuannBench) endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) - ConfigureCuannBench( + ConfigureAnnBench( NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS -mavx ) endif() if(RAFT_ANN_BENCH_USE_RAFT) - ConfigureCuannBench( + ConfigureAnnBench( NAME RAFT_IVF_PQ PATH @@ -139,20 +139,19 @@ if(RAFT_ANN_BENCH_USE_RAFT) $<$:bench/ann/src/raft/raft_ivf_pq.cu> $<$:bench/ann/src/raft/raft_ivf_flat.cu> LINKS - raft::distance - raft::nn + raft::compiled ) endif() if(RAFT_ANN_BENCH_USE_FAISS) - ConfigureCuannBench( + ConfigureAnnBench( NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() if(RAFT_ANN_BENCH_USE_GGNN) include(cmake/thirdparty/get_glog.cmake) - ConfigureCuannBench( + ConfigureAnnBench( NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include LINKS glog::glog ) diff --git a/cpp/bench/ann/scripts/hdf5_to_fbin.py b/cpp/bench/ann/scripts/hdf5_to_fbin.py index 3ee57b6981..cfeb184ea8 100755 --- a/cpp/bench/ann/scripts/hdf5_to_fbin.py +++ b/cpp/bench/ann/scripts/hdf5_to_fbin.py @@ -27,58 +27,59 @@ def write_bin(fname, data): data.tofile(f) -if len(sys.argv) != 2 and len(sys.argv) != 3: - print( - 
"usage: %s [-n] .hdf5\n" % (sys.argv[0]), - " -n: normalize base/query set\n", - "outputs: .base.fbin\n", - " .query.fbin\n", - " .groundtruth.neighbors.ibin\n", - " .groundtruth.distances.fbin", - file=sys.stderr, - ) - sys.exit(-1) - -need_normalize = False -if len(sys.argv) == 3: - assert sys.argv[1] == "-n" - need_normalize = True -fname_prefix = sys.argv[-1] -assert fname_prefix.endswith(".hdf5") -fname_prefix = fname_prefix[:-5] +if __name__ == "__main__": + if len(sys.argv) != 2 and len(sys.argv) != 3: + print( + "usage: %s [-n] .hdf5\n" % (sys.argv[0]), + " -n: normalize base/query set\n", + "outputs: .base.fbin\n", + " .query.fbin\n", + " .groundtruth.neighbors.ibin\n", + " .groundtruth.distances.fbin", + file=sys.stderr, + ) + sys.exit(-1) -hdf5 = h5py.File(sys.argv[-1], "r") -assert ( - hdf5.attrs["distance"] == "angular" - or hdf5.attrs["distance"] == "euclidean" -) -assert hdf5["train"].dtype == np.float32 -assert hdf5["test"].dtype == np.float32 -assert hdf5["neighbors"].dtype == np.int32 -assert hdf5["distances"].dtype == np.float32 + need_normalize = False + if len(sys.argv) == 3: + assert sys.argv[1] == "-n" + need_normalize = True + fname_prefix = sys.argv[-1] + assert fname_prefix.endswith(".hdf5") + fname_prefix = fname_prefix[:-5] -base = hdf5["train"][:] -query = hdf5["test"][:] -if need_normalize: - base = normalize(base) - query = normalize(query) -elif hdf5.attrs["distance"] == "angular": - print( - "warning: input has angular distance, specify -n to normalize base/query set!\n" + hdf5 = h5py.File(sys.argv[-1], "r") + assert ( + hdf5.attrs["distance"] == "angular" + or hdf5.attrs["distance"] == "euclidean" ) + assert hdf5["train"].dtype == np.float32 + assert hdf5["test"].dtype == np.float32 + assert hdf5["neighbors"].dtype == np.int32 + assert hdf5["distances"].dtype == np.float32 + + base = hdf5["train"][:] + query = hdf5["test"][:] + if need_normalize: + base = normalize(base) + query = normalize(query) + elif hdf5.attrs["distance"] == 
"angular": + print( + "warning: input has angular distance, specify -n to normalize base/query set!\n" + ) -output_fname = fname_prefix + ".base.fbin" -print("writing", output_fname, "...") -write_bin(output_fname, base) + output_fname = fname_prefix + ".base.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, base) -output_fname = fname_prefix + ".query.fbin" -print("writing", output_fname, "...") -write_bin(output_fname, query) + output_fname = fname_prefix + ".query.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, query) -output_fname = fname_prefix + ".groundtruth.neighbors.ibin" -print("writing", output_fname, "...") -write_bin(output_fname, hdf5["neighbors"][:]) + output_fname = fname_prefix + ".groundtruth.neighbors.ibin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["neighbors"][:]) -output_fname = fname_prefix + ".groundtruth.distances.fbin" -print("writing", output_fname, "...") -write_bin(output_fname, hdf5["distances"][:]) + output_fname = fname_prefix + ".groundtruth.distances.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["distances"][:]) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 6eb75976f3..59e7244f33 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -45,9 +45,6 @@ using std::vector; namespace raft::bench::ann { -// supported types: float, half (very few implementations support it), uint8_t, int8_t -using data_t = float; - inline bool check_file_exist(const std::vector& files) { bool ret = true; @@ -485,6 +482,52 @@ inline const std::string usage(const string& argv0) " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; } +template +inline int dispatch_benchmark(Configuration& conf, + std::string& index_patterns, + bool force_overwrite, + bool only_check, + bool build_mode, + bool search_mode) +{ + try { + auto dataset_conf = 
conf.get_dataset_conf(); + + BinDataset dataset(dataset_conf.name, + dataset_conf.base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + dataset_conf.query_file, + dataset_conf.distance); + + vector indices = conf.get_indices(index_patterns); + if (!check(indices, build_mode, force_overwrite)) { return -1; } + + std::string message = "will "; + message += build_mode ? "build:" : "search:"; + for (const auto& index : indices) { + message += "\n " + index.name; + } + log_info("%s", message.c_str()); + + if (only_check) { + log_info("%s", "all check passed, quit due to option -c"); + return 0; + } + + if (build_mode) { + build(&dataset, indices); + } else if (search_mode) { + search(&dataset, indices); + } + } catch (const std::exception& e) { + log_error("exception occurred: %s", e.what()); + return -1; + } + + return 0; +} + inline int run_main(int argc, char** argv) { bool force_overwrite = false; @@ -523,37 +566,23 @@ inline int run_main(int argc, char** argv) try { Configuration conf(conf_stream); - - auto dataset_conf = conf.get_dataset_conf(); - BinDataset dataset(dataset_conf.name, - dataset_conf.base_file, - dataset_conf.subset_first_row, - dataset_conf.subset_size, - dataset_conf.query_file, - dataset_conf.distance); - - vector indices = conf.get_indices(index_patterns); - if (!check(indices, build_mode, force_overwrite)) { return -1; } - - std::string message = "will "; - message += build_mode ? 
"build:" : "search:"; - for (const auto& index : indices) { - message += "\n " + index.name; - } - log_info("%s", message.c_str()); - - if (only_check) { - log_info("%s", "all check passed, quit due to option -c"); - return 0; + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "uint8") { + dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "int8") { + dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else { + log_error("datatype %s not supported", dtype); } - if (build_mode) { - build(&dataset, indices); - } else if (search_mode) { - search(&dataset, indices); - } } catch (const std::exception& e) { - log_error("exception occurs: %s", e.what()); + log_error("exception occurred: %s", e.what()); return -1; } diff --git a/cpp/bench/ann/src/common/conf.cpp b/cpp/bench/ann/src/common/conf.cpp index 66a8d252b4..f690f68783 100644 --- a/cpp/bench/ann/src/common/conf.cpp +++ b/cpp/bench/ann/src/common/conf.cpp @@ -66,6 +66,21 @@ void Configuration::parse_dataset_(const nlohmann::json& conf) dataset_conf_.subset_first_row = conf.at("subset_first_row"); } if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (!filename.compare(filename.size() - 4, 4, "fbin")) { + dataset_conf_.dtype = "float"; + } else if (!filename.compare(filename.size() - 5, 5, "u8bin")) { + dataset_conf_.dtype = "uint8"; + } else if (!filename.compare(filename.size() - 5, 5, "i8bin")) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset"); + } + } } void Configuration::parse_index_(const nlohmann::json& index_conf, 
diff --git a/cpp/bench/ann/src/common/conf.h b/cpp/bench/ann/src/common/conf.h index c498a93ca1..845defe94a 100644 --- a/cpp/bench/ann/src/common/conf.h +++ b/cpp/bench/ann/src/common/conf.h @@ -52,6 +52,9 @@ class Configuration { size_t subset_size{0}; std::string query_file; std::string distance; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; }; Configuration(std::istream& conf_stream); diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index c961683aa6..cb30c2693f 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -20,8 +20,7 @@ #include #include #include -#include -#include +#include #include #include #include diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index b28c502a2c..d8e98ce2a9 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -22,6 +22,10 @@ #include #include +#ifdef RAFT_COMPILED +#include +#endif + #include "../common/ann_types.hpp" #include "../common/benchmark_util.hpp" #undef WARP_SIZE @@ -30,13 +34,15 @@ #endif #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT #include "raft_ivf_flat_wrapper.h" -extern template class raft::bench::ann::RaftIvfFlatGpu; -extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; #endif #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ #include "raft_ivf_pq_wrapper.h" -extern template class raft::bench::ann::RaftIvfPQ; -extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; #endif #define JSON_DIAGNOSTICS 1 #include @@ -162,17 +168,17 @@ std::unique_ptr> 
create_algo(const std::string& algo, #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT if (algo == "raft_ivf_flat") { - typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; - parse_build_param(conf, param); - ann = std::make_unique>(metric, dim, param); + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); } #endif #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ if (algo == "raft_ivf_pq") { - typename raft::bench::ann::RaftIvfPQ::BuildParam param; - parse_build_param(conf, param); + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); ann = - std::make_unique>(metric, dim, param, refine_ratio); + std::make_unique>(metric, dim, param, refine_ratio); } #endif if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } @@ -194,15 +200,15 @@ std::unique_ptr::AnnSearchParam> create_search #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT if (algo == "raft_ivf_flat") { auto param = - std::make_unique::SearchParam>(); - parse_search_param(conf, *param); + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); return param; } #endif #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ if (algo == "raft_ivf_pq") { - auto param = std::make_unique::SearchParam>(); - parse_search_param(conf, *param); + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); return param; } #endif diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu index 8c31652186..ff108080b5 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu @@ -15,7 +15,12 @@ */ #include "raft_ivf_flat_wrapper.h" +#ifdef RAFT_COMPILED +#include +#endif + namespace raft::bench::ann { -template class RaftIvfFlatGpu; -template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; } // namespace raft::bench::ann \ No newline at end of 
file diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index bf6d37ed59..2896d992ea 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -23,11 +23,10 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include @@ -119,14 +118,14 @@ void RaftIvfFlatGpu::set_search_param(const AnnSearchParam& param) template void RaftIvfFlatGpu::save(const std::string& file) const { - raft::spatial::knn::ivf_flat::detail::serialize(handle_, file, *index_); + raft::neighbors::ivf_flat::serialize(handle_, file, *index_); return; } template void RaftIvfFlatGpu::load(const std::string& file) { - index_ = raft::spatial::knn::ivf_flat::detail::deserialize(handle_, file); + index_ = raft::neighbors::ivf_flat::deserialize(handle_, file); return; } diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu index 2de81545aa..338bc9a32f 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu @@ -15,7 +15,12 @@ */ #include "raft_ivf_pq_wrapper.h" +#ifdef RAFT_COMPILED +#include +#endif + namespace raft::bench::ann { -template class RaftIvfPQ; -template class RaftIvfPQ; +template class RaftIvfPQ; +template class RaftIvfPQ; +template class RaftIvfPQ; } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 5a0c4dd2b5..9b0dee6b84 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -54,7 +54,7 @@ class RaftIvfPQ : public ANN { void build(const T* dataset, size_t nrow, cudaStream_t stream) final; void set_search_param(const AnnSearchParam& param) override; - void set_search_dataset(const T* dataset, IdxT nrow) override; + 
void set_search_dataset(const T* dataset, size_t nrow) override; // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 @@ -119,8 +119,9 @@ void RaftIvfPQ::load(const std::string& file) template void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) { - index_.emplace(raft::runtime::neighbors::ivf_pq::build( - handle_, index_params_, dataset, IdxT(nrow), dimension_)); + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), index_->dim()); + + index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); return; } @@ -133,7 +134,7 @@ void RaftIvfPQ::set_search_param(const AnnSearchParam& param) } template -void RaftIvfPQ::set_search_dataset(const T* dataset, IdxT nrow) +void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) { dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); } @@ -146,28 +147,20 @@ void RaftIvfPQ::search(const T* queries, float* distances, cudaStream_t stream) const { - // raft::logger::get(raft::RAFT_NAME).set_level(RAFT_LEVEL_INFO); - - rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; if (refine_ratio_ > 1.0f) { - uint32_t k0 = static_cast(refine_ratio_ * k); + uint32_t k0 = static_cast(refine_ratio_ * k); + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); auto candidates = raft::make_device_matrix(handle_, batch_size, k0); - raft::runtime::neighbors::ivf_pq::search(handle_, - search_params_, - *index_, - queries, - batch_size, - k0, - candidates.data_handle(), - distances_tmp.data_handle(), - mr_ptr); + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); if (get_property().dataset_memory_type == MemoryType::Device) { auto queries_v = raft::make_device_matrix_view(queries, batch_size, 
index_->dim()); - auto neighbors_v = raft::make_device_matrix_view(neighbors, batch_size, k); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); raft::runtime::neighbors::refine(handle_, @@ -200,21 +193,21 @@ void RaftIvfPQ::search(const T* queries, distances_host.view(), index_->metric()); - raft::copy( - neighbors, neighbors_host.data_handle(), neighbors_host.size(), handle_.get_stream()); + raft::copy(neighbors, + (size_t*)neighbors_host.data_handle(), + neighbors_host.size(), + handle_.get_stream()); raft::copy( distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); } } else { - raft::runtime::neighbors::ivf_pq::search(handle_, - search_params_, - *index_, - queries, - batch_size, - k, - (IdxT*)neighbors, - distances, - mr_ptr); + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); } handle_.sync_stream(); return; diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index 377bb925b7..01f206ab70 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index d90c2d9d3d..29ea169115 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -81,6 +81,8 @@ if(BUILD_PRIMS_BENCH) NAME DISTANCE PATH + bench/prims/distance/tune_pairwise/bench.cu + bench/prims/distance/tune_pairwise/kernel.cu bench/prims/distance/distance_cosine.cu 
bench/prims/distance/distance_exp_l2.cu bench/prims/distance/distance_l1.cu diff --git a/cpp/bench/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu similarity index 100% rename from cpp/bench/distance/tune_pairwise/bench.cu rename to cpp/bench/prims/distance/tune_pairwise/bench.cu diff --git a/cpp/bench/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu similarity index 100% rename from cpp/bench/distance/tune_pairwise/kernel.cu rename to cpp/bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh similarity index 100% rename from cpp/bench/distance/tune_pairwise/kernel.cuh rename to cpp/bench/prims/distance/tune_pairwise/kernel.cuh diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 0000000000..b7c132f2f1 --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,87 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_faiss) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(faiss + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) + + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) + endif() + + rapids_cpm_find(faiss ${PKG_VERSION} + GLOBAL_TARGETS faiss::faiss + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + OPTIONS + "FAISS_ENABLE_PYTHON OFF" + "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + "FAISS_ENABLE_GPU ON" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + ) + + if(TARGET faiss AND NOT TARGET faiss::faiss) + add_library(faiss::faiss ALIAS faiss) + endif() + + if(faiss_ADDED) + rapids_export(BUILD faiss + EXPORT_SET faiss-targets + GLOBAL_TARGETS faiss + NAMESPACE faiss::) + endif() + + # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` + rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it + rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + + # Tell cmake where it can find the generated faiss-config.cmake we wrote. 
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-ann-bench-exports) +endfunction() + +if(NOT RAFT_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT RAFT_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.0 + REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} + PINNED_TAG ${RAFT_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake index 109bdc03d4..5de98a47ce 100644 --- a/cpp/cmake/thirdparty/get_nlohmann_json.cmake +++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake @@ -21,8 +21,8 @@ function(find_and_configure_nlohmann_json) rapids_cpm_find(nlohmann_json ${PKG_VERSION} GLOBAL_TARGETS nlohmann_json::nlohmann_json - BUILD_EXPORT_SET cuann_bench-exports - INSTALL_EXPORT_SET cuann_bench-exports + BUILD_EXPORT_SET raft-bench-ann-exports + INSTALL_EXPORT_SET raft-bench-ann-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git GIT_TAG ${PKG_PINNED_TAG} diff --git a/dependencies.yaml b/dependencies.yaml index bf9fed9316..0143fab27a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -149,6 +149,9 @@ dependencies: - hnswlib - nlohmann_json - glog + - h5py + - libfaiss>=1.7.1 + - faiss-proc=*=cuda 
cudatoolkit: specific: From 2964fef723d89611d0c3e0b6ea18eb96af3448eb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 23 Mar 2023 19:06:41 -0400 Subject: [PATCH 21/39] Reverting benchmark move --- cpp/bench/prims/CMakeLists.txt | 73 ++++++++++++++++------------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 29ea169115..f03a552c1d 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -15,26 +15,23 @@ # ################################################################################################## # * compiler function ----------------------------------------------------------------------------- -function(ConfigurePrimsBench) +function(ConfigureBench) - set(options OPTIONAL DIST NN) + set(options OPTIONAL LIB) set(oneValueArgs NAME) set(multiValueArgs PATH TARGETS CONFIGURATIONS) - cmake_parse_arguments( - ConfigurePrimsBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} - ) + cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(BENCH_NAME ${ConfigurePrimsBench_NAME}_PRIMS_BENCH) + set(BENCH_NAME ${ConfigureBench_NAME}) - add_executable(${BENCH_NAME} ${ConfigurePrimsBench_PATH}) + add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) target_link_libraries( ${BENCH_NAME} PRIVATE raft::raft raft_internal - $<$:raft::distance> - $<$:raft::nn> + $<$:raft::compiled> benchmark::benchmark Threads::Threads $ @@ -58,9 +55,7 @@ function(ConfigurePrimsBench) "$<$:${RAFT_CUDA_FLAGS}>" ) - target_include_directories( - ${BENCH_NAME} PUBLIC "$" - ) + target_include_directories(${BENCH_NAME} PUBLIC "$") install( TARGETS ${BENCH_NAME} @@ -71,18 +66,21 @@ function(ConfigurePrimsBench) endfunction() -if(BUILD_PRIMS_BENCH) - ConfigurePrimsBench( - NAME CLUSTER PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu - bench/prims/main.cpp OPTIONAL DIST NN +if(BUILD_BENCH) + 
ConfigureBench( + NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu + bench/prims/main.cpp OPTIONAL LIB + ) + + ConfigureBench( + NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu + bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp ) - ConfigurePrimsBench( + ConfigureBench( NAME - DISTANCE + DISTANCE_BENCH PATH - bench/prims/distance/tune_pairwise/bench.cu - bench/prims/distance/tune_pairwise/kernel.cu bench/prims/distance/distance_cosine.cu bench/prims/distance/distance_exp_l2.cu bench/prims/distance/distance_l1.cu @@ -92,12 +90,12 @@ if(BUILD_PRIMS_BENCH) bench/prims/distance/kernels.cu bench/prims/main.cpp OPTIONAL - DIST + LIB ) - ConfigurePrimsBench( + ConfigureBench( NAME - LINALG + LINALG_BENCH PATH bench/prims/linalg/add.cu bench/prims/linalg/map_then_reduce.cu @@ -110,35 +108,34 @@ if(BUILD_PRIMS_BENCH) bench/prims/main.cpp ) - ConfigurePrimsBench( - NAME MATRIX PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu - bench/prims/matrix/select_k.cu bench/prims/main.cpp + ConfigureBench( + NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB ) - ConfigurePrimsBench( - NAME RANDOM PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu + ConfigureBench( + NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu bench/prims/random/rng.cu bench/prims/main.cpp ) - ConfigurePrimsBench(NAME SPARSE PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) + ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) - ConfigurePrimsBench( + ConfigureBench( NAME - NEIGHBORS + NEIGHBORS_BENCH PATH bench/prims/neighbors/knn/brute_force_float_int64_t.cu bench/prims/neighbors/knn/brute_force_float_uint32_t.cu bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu 
bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu - bench/prims/neighbors/knn/ivf_pq_float_uint64_t.cu - bench/prims/neighbors/knn/ivf_pq_int8_t_uint64_t.cu - bench/prims/neighbors/knn/ivf_pq_uint8_t_uint64_t.cu - bench/prims/neighbors/refine_float_uint64_t.cu - bench/prims/neighbors/refine_uint8_t_uint64_t.cu + bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu + bench/prims/neighbors/refine_float_int64_t.cu + bench/prims/neighbors/refine_uint8_t_int64_t.cu bench/prims/main.cpp OPTIONAL - DIST - NN + LIB ) endif() From a24ce8d2b0609d583b368978d3daaeaa8cb67790 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 23 Mar 2023 19:14:56 -0400 Subject: [PATCH 22/39] Using proper macro --- cpp/include/raft/sparse/solver/detail/lanczos.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh index 63bc98b404..67d6f6c412 100644 --- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh +++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh @@ -962,7 +962,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, @@ -1312,7 +1312,7 @@ int computeLargestEigenvectors( cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, From c3994005c275e9982bcbe36a0a2758e032ff29bc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 23 Mar 2023 19:25:10 -0400 Subject: [PATCH 23/39] Replacing more instances --- .../raft/cluster/detail/kmeans_deprecated.cuh | 4 +-- cpp/include/raft/linalg/detail/lanczos.cuh | 4 +-- .../raft/solver/detail/lap_functions.cuh | 32 +++++++++---------- cpp/include/raft/solver/linear_assignment.cuh | 4 +-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index a9d8777304..e0fa1030da 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -383,7 +383,7 @@ static int chooseNewCentroid(raft::device_resources const& handle, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); CUDA_TRY(cudaMemcpyAsync( &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); @@ -606,7 +606,7 @@ static int assignCentroids(raft::device_resources const& handle, gridDim.y = 1; gridDim.z = 1; minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Compute residual sum of squares *residual_host = thrust::reduce( diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh index 8c0cfeba28..73d93ab535 100644 --- a/cpp/include/raft/linalg/detail/lanczos.cuh +++ b/cpp/include/raft/linalg/detail/lanczos.cuh @@ -958,7 +958,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, @@ -1305,7 +1305,7 @@ int computeLargestEigenvectors( cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis 
to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, diff --git a/cpp/include/raft/solver/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh index 440e6901c6..63f27e6346 100644 --- a/cpp/include/raft/solver/detail/lap_functions.cuh +++ b/cpp/include/raft/solver/detail/lap_functions.cuh @@ -113,7 +113,7 @@ inline void initialReduction(raft::device_resources const& handle, kernel_rowReduction<<>>( d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_columnReduction<<>>( d_costs, d_vertices_dev.row_duals, @@ -121,7 +121,7 @@ inline void initialReduction(raft::device_resources const& handle, SP, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } template @@ -159,7 +159,7 @@ inline void computeInitialAssignments(raft::device_resources const& handle, SP, N, epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. 
@@ -191,7 +191,7 @@ inline int computeRowCovers(raft::device_resources const& handle, kernel_computeRowCovers<<>>( d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } @@ -268,7 +268,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, 0, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); thrust::exclusive_scan( @@ -286,7 +286,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -356,7 +356,7 @@ inline void reversePass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. 
std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); @@ -375,11 +375,11 @@ inline void reversePass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -410,7 +410,7 @@ inline void augmentationPass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. // TODO: should be vertex_t @@ -432,7 +432,7 @@ inline void augmentationPass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_augmentation<<>>( d_vertices_dev.row_assignments, @@ -443,7 +443,7 @@ inline void augmentationPass(raft::device_resources const& handle, vertex_t{N}, row_ids_csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -471,7 +471,7 @@ inline void dualUpdate(raft::device_resources const& handle, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_dualUpdate_2<<>>( @@ -488,7 +488,7 @@ inline void dualUpdate(raft::device_resources const& handle, std::numeric_limits::max(), epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. 
@@ -508,7 +508,7 @@ inline void calcObjValDual(raft::device_resources const& handle, kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. @@ -529,7 +529,7 @@ inline void calcObjValPrimal(raft::device_resources const& handle, kernel_calcObjValPrimal<<>>( d_obj_val, d_costs, d_row_assignments, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } // namespace raft::solver::detail diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh index 7904c04ede..6e66bafe1f 100644 --- a/cpp/include/raft/solver/linear_assignment.cuh +++ b/cpp/include/raft/solver/linear_assignment.cuh @@ -170,7 +170,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } @@ -183,7 +183,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } From d327bd14fdebb4631181d203b9bdd1b341e45dce Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 23 Mar 2023 19:26:32 -0400 Subject: [PATCH 24/39] CUDA_TRY --- .../raft/cluster/detail/kmeans_deprecated.cuh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index e0fa1030da..bb1d122a24 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -384,7 +384,7 @@ static int chooseNewCentroid(raft::device_resources const& handle, thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); RAFT_CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector @@ -523,7 +523,7 @@ static int initializeCentroids(raft::device_resources const& handle, WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); RAFT_CHECK_CUDA(stream); @@ -534,7 +534,7 @@ static int initializeCentroids(raft::device_resources const& handle, } // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, codes, clusterSizes); RAFT_CHECK_CUDA(stream); @@ -598,7 +598,7 @@ static int assignCentroids(raft::device_resources const& handle, RAFT_CHECK_CUDA(stream); // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); blockDim.x = 
BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; @@ -825,8 +825,8 @@ int kmeans(raft::device_resources const& handle, // Trivial cases if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY( cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); @@ -837,7 +837,7 @@ int kmeans(raft::device_resources const& handle, 1, std::min(ceildiv(n, BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); RAFT_CHECK_CUDA(stream); *residual_host = thrust::reduce( From ff724f3c8bb6595c6626b105c3edaeecd5094ab0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 23 Mar 2023 19:30:19 -0400 Subject: [PATCH 25/39] Using proper return code --- cpp/bench/ann/src/common/benchmark.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 59e7244f33..b4d8fbeee3 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -569,13 +569,13 @@ inline int run_main(int argc, char** argv) std::string dtype = conf.get_dataset_conf().dtype; if (dtype == "float") { - dispatch_benchmark( + return dispatch_benchmark( conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); } else if (dtype == "uint8") { - dispatch_benchmark( + return dispatch_benchmark( conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); } else if (dtype == "int8") { - dispatch_benchmark( + return dispatch_benchmark( conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); } else { log_error("datatype %s not supported", dtype); @@ -586,6 +586,6 @@ inline int run_main(int argc, char** argv) return -1; } - return 0; + return -1; } }; // namespace raft::bench::ann From 5937d75923582d01ec6fd12db516e883f36e74a0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Mar 2023 12:50:14 -0400 Subject: [PATCH 26/39] Adding conda generated file for ann benchmarks --- dependencies.yaml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 0143fab27a..bdec1a6262 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -15,6 +15,15 @@ files: - run_pylibraft - test_python_common - test_pylibraft + bench_ann: + output: conda + matrix: + cuda: ["11.8"] + arch: [x86_64] + includes: + - nn_bench + - build + - cudatoolkit test_cpp: output: none includes: @@ -26,11 +35,6 @@ files: - py_version - test_python_common - test_pylibraft - bench_ann: - output: none - includes: - - cudatoolkit - - nn_bench checks: output: none includes: From aeb448ed8a0d0718e35cb80213bb4ede66928b7e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 12:50:36 -0400 Subject: [PATCH 27/39] Adding actual conda dev file --- .../bench_ann_cuda-118_arch-x86_64.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml new file mode 100644 index 0000000000..6d7974d53f --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -0,0 +1,35 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- c-compiler +- cmake>=3.23.1,!=3.25.0 +- cuda-profiler-api=11.8.86 +- cudatoolkit=11.8 +- cxx-compiler +- cython>=0.29,<0.30 +- faiss-proc=*=cuda +- gcc_linux-64=11.* +- glog +- h5py +- hnswlib +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1 +- nccl>=2.9.9 +- ninja +- nlohmann_json +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-118_arch-x86_64 From ed74608e46e2c43b631a773ede7d72d2e370d6f6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 13:33:20 -0400 Subject: [PATCH 28/39] Checking in --- build.sh | 1 - conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 ++ dependencies.yaml | 3 ++- docs/source/cuda_ann_benchmarks.md | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index b6a6567cdc..ca81c72914 100755 --- a/build.sh +++ b/build.sh @@ -376,7 +376,6 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DCMAKE_CUDA_ARCHITECTURES=${RAFT_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DRAFT_COMPILE_LIBRARIES=${COMPILE_LIBRARIES} \ -DRAFT_NVTX=${NVTX} \ -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ -DBUILD_TESTS=${BUILD_TESTS} \ diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 6d7974d53f..c5873302f6 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -8,6 +8,8 @@ channels: - nvidia dependencies: - c-compiler +- clang-tools=11.1.0 +- clang=11.1.0 - cmake>=3.23.1,!=3.25.0 - cuda-profiler-api=11.8.86 - cudatoolkit=11.8 diff --git 
a/dependencies.yaml b/dependencies.yaml index bdec1a6262..ee40da283e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -21,9 +21,10 @@ files: cuda: ["11.8"] arch: [x86_64] includes: - - nn_bench - build + - develop - cudatoolkit + - nn_bench test_cpp: output: none includes: diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md index 9ed9d2ffa1..d5292d8be2 100644 --- a/docs/source/cuda_ann_benchmarks.md +++ b/docs/source/cuda_ann_benchmarks.md @@ -6,7 +6,7 @@ This project provides a benchmark program for various ANN search implementations ### Dependencies -TODO: Need to fill in a conda environment file and direct users to it + ### compiling benchmark From 94d01586c6c9c1abf660523604dd6ac0acd2fadc Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 13:59:39 -0400 Subject: [PATCH 29/39] Making sure we find Threads even when googletest isn't installed --- cpp/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aa3553439a..68d3ef4345 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -59,14 +59,20 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) set(RAFT_COMPILE_LIBRARY_DEFAULT OFF) -if(BUILD_TESTS OR BUILD_PRIMS_BENCH) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH +) set(RAFT_COMPILE_LIBRARY_DEFAULT ON) endif() option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations" ${RAFT_COMPILE_LIBRARY_DEFAULT} ) -if(BUILD_TESTS OR BUILD_PRIMS_BENCH) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH +) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs # to have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value in first run and subsequent runs. From 7b770c6839218662eea4b635e4d17aec2f7ed6c6 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Mar 2023 15:35:43 -0400 Subject: [PATCH 30/39] Updating docs --- cpp/bench/ann/CMakeLists.txt | 2 + cpp/bench/ann/README.md | 3 + docs/source/cuda_ann_benchmarks.md | 105 +++++++++++++++++------------ 3 files changed, 66 insertions(+), 44 deletions(-) create mode 100644 cpp/bench/ann/README.md diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index f3fbb84646..df4e498a4c 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -24,6 +24,8 @@ option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchm option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) +find_package(Threads REQUIRED) + set(RAFT_ANN_BENCH_USE_FAISS OFF) if(RAFT_ANN_BENCH_USE_FAISS_BFKNN OR RAFT_ANN_BENCH_USE_FAISS_IVFPQ diff --git a/cpp/bench/ann/README.md b/cpp/bench/ann/README.md new file mode 100644 index 0000000000..1a8af2e448 --- /dev/null +++ b/cpp/bench/ann/README.md @@ -0,0 +1,3 @@ +# RAFT CUDA ANN Benchmarks + +Please see the [ANN Benchmarks](https://docs.rapids.ai/api/raft/stable/cuda_ann_benchmarks.html) section of the RAFT documentation for instructions on building and using the ANN benchmarks. \ No newline at end of file diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md index d5292d8be2..708f5f7dba 100644 --- a/docs/source/cuda_ann_benchmarks.md +++ b/docs/source/cuda_ann_benchmarks.md @@ -1,16 +1,37 @@ # CUDA ANN Benchmarks -This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations. +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. ## Benchmark ### Dependencies +CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. 
+Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. -### compiling benchmark +In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: +1. FAISS GPU >= 1.7.1 +2. Google Logging (GLog) +3. H5Py +4. HNSWLib +5. nlohmann_json +6. GGNN + +[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. + +The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: -The easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. The following will build the executables for all the support algorithms: +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks +``` + +The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. + +### Compiling the Benchmarks + +After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. 
The following will build the executables for all the support algorithms: ```bash ./build.sh bench-ann ``` @@ -30,28 +51,29 @@ Available targets to use with `--limit-bench-ann` are: - RAFT_IVF_FLAT_ANN_BENCH - RAFT_BFKNN_ANN_BENCH -By default, the `*_ANN_BENCH` executables program accept dataset of `float` type. To use other type, change the line `using data_t = float;` in `cpp/bench/ann/src/benchmark.cu` (or `cpp/bench/ann/src/benchmark/cpp` if benchmarking a non-CUDA algorithm) to the target type. For example, `using data_t = uint8_t;` will enable running `benchmark` with dataset of `uint8_t` type. - +By default, the `*_ANN_BENCH` executables program infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, int8_t`, and `unit8_t` are supported. ### Usage -There are 4 steps to run the benchmark: -1. prepare dataset -2. build index -3. search using built index -4. evaluate result - -#### TL;DR -A complete example (run from the RAFT source code root directory): -``` +There are 4 general steps to running the benchmarks: +1. Prepare Dataset +2. Build Index +3. Search Using Built Index +4. 
Evaluate Result + +#### End-to-end Example +An end-to-end example (run from the RAFT source code root directory): +```bash # (1) prepare a dataset -pip3 install numpy h5py # if they have not been installed already pushd + cd cpp/bench/ann mkdir data && cd data wget http://ann-benchmarks.com/glove-100-angular.hdf5 + # option -n is used here to normalize vectors so cosine distance is converted # to inner product; don't use -n for l2 distance python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 + mkdir glove-100-inner mv glove-100-angular.base.fbin glove-100-inner/base.fbin mv glove-100-angular.query.fbin glove-100-inner/query.fbin @@ -77,8 +99,7 @@ popd # optional step: plot QPS-Recall figure using data in result.csv with your favorite tool ``` - -#### step 1: preparing dataset +##### Step 1: Prepare Dataset A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. @@ -87,14 +108,14 @@ These binary files are little-endian and the format is: the first 8 bytes are `n Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. Commonly used datasets can be downloaded from two websites: -1. 
Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: - ``` + ```bash pip3 install numpy h5py ``` The usage of this script is: - ``` + ```bash $ cpp/bench/ann/scripts/hdf5_to_fbin.py usage: scripts/hdf5_to_fbin.py [-n] .hdf5 -n: normalize base/query set @@ -107,14 +128,13 @@ Commonly used datasets can be downloaded from two websites: Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. - -2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: - ``` +2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. 
A script is provided for this: + ```bash $ cpp/bench/ann/scripts/split_groundtruth.pl usage: script/split_groundtruth.pl input output_prefix ``` Take Deep-1B dataset as an example: - ``` + ```bash pushd cd cpp/bench/ann mkdir -p data/deep-1B && cd data/deep-1B @@ -127,7 +147,7 @@ Commonly used datasets can be downloaded from two websites: Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. -#### step 2: building index +##### Step 2: Build Index An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: @@ -144,9 +164,8 @@ To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configurat - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. 
- The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: -``` +```bash $ ./cpp/build/*_ANN_BENCH -h usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json -b: build mode, will build index @@ -165,7 +184,7 @@ usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json * `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: -``` +```json "index" : [ { "name" : "hnsw1", @@ -182,7 +201,7 @@ It's easier to describe the usage of `-i` option with an example. Suppose we hav ] ``` Then, -``` +```bash # build all indices: hnsw1, hnsw2 and faiss ./cpp/build/HNSWLIB_ANN_BENCH -b a.json @@ -201,13 +220,13 @@ Then, In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. -#### step 3: searching +##### Step 3: Searching Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. -#### step 4: evaluating results +##### Step 4: Evaluating Results Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: -``` +```bash $ cpp/bench/ann/scripts/eval.pl usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... result_paths... are paths to the search result files. @@ -220,7 +239,7 @@ usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... ``` Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. 
If it's a directory, all files found under it recursively will be used as input files. An example: -``` +```bash cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ result/glove-100-angular/10/faiss/ @@ -231,13 +250,11 @@ This script prints recall and QPS for every result file. Also, it outputs estima It saves recall value in result txt file, so avoids to recompute recall if the same command is run again. To force to recompute recall, option `-f` can be used. If option `-o ` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. - - -## How to add a new ANN algorithm +## Adding a new ANN algorithm Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: -``` +```c++ template class HnswLib : public ANN { public: @@ -258,24 +275,24 @@ public: ``` The benchmark program uses JSON configuration file. To add the new algorithm to the benchmark, need be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in configuration file. Still take the configuration for `HnswLib` as an example: -``` +```json { "name" : "...", "algo" : "hnswlib", "build_param": {"M":12, "efConstruction":500, "numThreads":32}, - "file" : "...", + "file" : "/path/to/file", "search_params" : [ {"ef":10, "numThreads":1}, {"ef":20, "numThreads":1}, {"ef":40, "numThreads":1}, ], - "search_result_file" : "..." 
+ "search_result_file" : "/path/to/file" }, ``` How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: -* First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: - ``` +1. First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: + ```c++ template void parse_build_param(const nlohmann::json& conf, typename cuann::HnswLib::BuildParam& param) { @@ -296,8 +313,8 @@ How to interpret these JSON objects is totally left to the implementation and sh } ``` -* Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, - ``` +2. Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, + ```c++ // JSON configuration file contains a line like: "algo" : "hnswlib" if (algo == "hnswlib") { // ... From 4f30b9b35e64ff34e9fe94c1c7d25ec445a26d96 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Mar 2023 16:57:42 -0400 Subject: [PATCH 31/39] Adding conda recipe --- .../bench_ann_cuda-118_arch-x86_64.yaml | 8 ++-- .../recipes/libraft/build_libraft_nn_bench.sh | 5 ++ conda/recipes/libraft/conda_build_config.yaml | 12 +++++ conda/recipes/libraft/meta.yaml | 46 +++++++++++++++++++ cpp/bench/ann/CMakeLists.txt | 2 +- dependencies.yaml | 8 ++-- 6 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 conda/recipes/libraft/build_libraft_nn_bench.sh diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index c5873302f6..e577435160 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -17,9 +17,9 @@ dependencies: - cython>=0.29,<0.30 - faiss-proc=*=cuda - gcc_linux-64=11.* -- glog -- h5py -- hnswlib +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib>=0.7.0 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 @@ -31,7 +31,7 @@ dependencies: - libfaiss>=1.7.1 - nccl>=2.9.9 - ninja -- nlohmann_json +- nlohmann_json>=3.11.2 - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/build_libraft_nn_bench.sh b/conda/recipes/libraft/build_libraft_nn_bench.sh new file mode 100644 index 0000000000..dc6250f0f4 --- /dev/null +++ b/conda/recipes/libraft/build_libraft_nn_bench.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +./build.sh tests bench-ann --allgpuarch --no-nvtx +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index e1079f4db8..43c9126f89 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -19,6 +19,18 @@ nccl_version: gtest_version: - "=1.10.0" +glog_version: + - ">=0.6.0" + +faiss_version: + - ">=1.7.1" + +hnswlib_version: + - ">=0.7.0" + +nlohmann_json_version: + - ">=3.11.2" + # The CTK libraries below are missing from the conda-forge::cudatoolkit # package. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index f911166a9a..52166da0cb 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -186,3 +186,49 @@ outputs: home: https://rapids.ai/ license: Apache-2.0 summary: libraft template + - name: libraft-ann-bench + version: {{ version }} + script: build_libraft_tests.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + - {{ compiler('cuda') }} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cuda') }} {{ cuda_version }} + - {{ compiler('cxx') }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-profiler-api {{ cuda_profiler_api_host_version }} + - libcublas {{ libcublas_host_version }} + - libcublas-dev {{ libcublas_host_version }} + - libcurand {{ libcurand_host_version }} + - libcurand-dev {{ libcurand_host_version }} + - libcusolver {{ libcusolver_host_version }} + - 
libcusolver-dev {{ libcusolver_host_version }} + - libcusparse {{ libcusparse_host_version }} + - libcusparse-dev {{ libcusparse_host_version }} + - glog {{ glog_version }} + - hnswlib {{ hnswlib_version }} + - nlohmann_json {{ nlohmann_json_version }} + - libfaiss>=1.7.1 + - faiss-proc=*=cuda + run: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - glog {{ glog_version }} + - faiss-proc=*=cuda + - libfaiss {{ faiss_version }} + - h5py {{ h5py_version }} + - hnswlib {{ hnswlib_version }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libraft ann bench diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index df4e498a4c..e4f00e879e 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -119,7 +119,7 @@ function(ConfigureAnnBench) install( TARGETS ${BENCH_NAME} - COMPONENT testing + COMPONENT ann_bench DESTINATION bin/ann EXCLUDE_FROM_ALL ) diff --git a/dependencies.yaml b/dependencies.yaml index ee40da283e..51a1f5294b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -151,10 +151,10 @@ dependencies: common: - output_types: [conda] packages: - - hnswlib - - nlohmann_json - - glog - - h5py + - hnswlib>=0.7.0 + - nlohmann_json>=3.11.2 + - glog>=0.6.0 + - h5py>=3.8.0 - libfaiss>=1.7.1 - faiss-proc=*=cuda From 8cd5c07adba6239f724c3a1809ca2645a1ca5a6b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Mar 2023 17:04:16 -0400 Subject: [PATCH 32/39] Adding h5py version --- conda/recipes/libraft/conda_build_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 43c9126f89..3eca49082c 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -28,6 +28,9 @@ faiss_version: hnswlib_version: - ">=0.7.0" +h5py_version: + - ">=3.8.0" + nlohmann_json_version: - ">=3.11.2" From b134133c4430c50efaf7fdb9f2640fc8836ba35b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 17:29:27 -0400 Subject: [PATCH 33/39] Hnswlib recipe update --- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/recipes/libraft/conda_build_config.yaml | 2 +- conda/recipes/libraft/meta.yaml | 2 +- dependencies.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index e577435160..5965aaef8f 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib>=0.7.0 +- hnswlib=0.7.0 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 3eca49082c..d3a390ff02 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -26,7 +26,7 @@ faiss_version: - ">=1.7.1" hnswlib_version: - - ">=0.7.0" + - "=0.7.0" h5py_version: - ">=3.8.0" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 52166da0cb..995c3f0c5c 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -188,7 
+188,7 @@ outputs: summary: libraft template - name: libraft-ann-bench version: {{ version }} - script: build_libraft_tests.sh + script: build_libraft_ann_bench.sh build: script_env: *script_env number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/dependencies.yaml b/dependencies.yaml index 51a1f5294b..64fd7cd454 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -151,7 +151,7 @@ dependencies: common: - output_types: [conda] packages: - - hnswlib>=0.7.0 + - hnswlib=0.7.0 - nlohmann_json>=3.11.2 - glog>=0.6.0 - h5py>=3.8.0 From d4d5fdd880610eb61015353b0cb15376978c7c81 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 17:34:03 -0400 Subject: [PATCH 34/39] Using proper filename --- conda/recipes/libraft/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 995c3f0c5c..384d5d8551 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -188,7 +188,7 @@ outputs: summary: libraft template - name: libraft-ann-bench version: {{ version }} - script: build_libraft_ann_bench.sh + script: build_libraft_nn_bench.sh build: script_env: *script_env number: {{ GIT_DESCRIBE_NUMBER }} From 5d093a370618f37ad22165559a0b4555b5232b7c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 24 Mar 2023 18:07:24 -0400 Subject: [PATCH 35/39] Trying again --- conda/recipes/libraft/conda_build_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index d3a390ff02..6fc5cf4ef1 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -26,7 +26,7 @@ faiss_version: - ">=1.7.1" hnswlib_version: - - "=0.7.0" + - "=0.7.*" h5py_version: - ">=3.8.0" From d8cd9f7909192b17cdd272342402a163f79028e3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 24 Mar 2023 18:25:52 -0400 Subject: [PATCH 36/39] Removing hnswlib from the conda recipe --- conda/recipes/libraft/conda_build_config.yaml | 3 --- conda/recipes/libraft/meta.yaml | 2 -- 2 files changed, 5 deletions(-) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 6fc5cf4ef1..2a66f213a7 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -25,9 +25,6 @@ glog_version: faiss_version: - ">=1.7.1" -hnswlib_version: - - "=0.7.*" - h5py_version: - ">=3.8.0" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 384d5d8551..7859807777 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -216,7 +216,6 @@ outputs: - libcusparse {{ libcusparse_host_version }} - libcusparse-dev {{ libcusparse_host_version }} - glog {{ glog_version }} - - hnswlib {{ hnswlib_version }} - nlohmann_json {{ nlohmann_json_version }} - libfaiss>=1.7.1 - faiss-proc=*=cuda @@ -227,7 +226,6 @@ outputs: - faiss-proc=*=cuda - libfaiss {{ faiss_version }} - h5py {{ h5py_version }} - - hnswlib {{ hnswlib_version }} about: home: https://rapids.ai/ license: Apache-2.0 From 66f2b48b10faab357eb7d6b174f4b523e6ac0b74 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 27 Mar 2023 17:40:15 -0400 Subject: [PATCH 37/39] Adding FindAVX.cmake --- cpp/bench/ann/CMakeLists.txt | 2 +- cpp/cmake/modules/FindAVX.cmake | 108 +++++++++++++++++++++++++ cpp/cmake/thirdparty/get_hnswlib.cmake | 11 +++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 cpp/cmake/modules/FindAVX.cmake diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index e4f00e879e..6267be518e 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -128,7 +128,7 @@ endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureAnnBench( NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES - ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS -mavx + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" ) endif() diff --git a/cpp/cmake/modules/FindAVX.cmake b/cpp/cmake/modules/FindAVX.cmake new file mode 100644 index 0000000000..db0b3830a3 --- /dev/null +++ b/cpp/cmake/modules/FindAVX.cmake @@ -0,0 +1,108 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +INCLUDE(CheckCXXSourceRuns) + +SET(AVX_CODE + " + #include + + int main() + { + __m256 a; + a = _mm256_set1_ps(0); + return 0; + } +" +) + +SET(AVX512_CODE + " + #include + + int main() + { + __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m512i b = a; + __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); + return 0; + } +" +) + +SET(AVX2_CODE + " + #include + + int main() + { + __m256i a = {0}; + a = _mm256_abs_epi16(a); + __m256i x; + _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code + return 0; + } +" +) + +MACRO(CHECK_SSE lang type flags) + SET(__FLAG_I 1) + SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + FOREACH(__FLAG ${flags}) + IF(NOT ${lang}_${type}_FOUND) + SET(CMAKE_REQUIRED_FLAGS ${__FLAG}) + CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) + IF(${lang}_HAS_${type}_${__FLAG_I}) + SET(${lang}_${type}_FOUND + TRUE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "${__FLAG}" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + MATH(EXPR __FLAG_I "${__FLAG_I}+1") + ENDIF() + ENDFOREACH() + SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + IF(NOT ${lang}_${type}_FOUND) + SET(${lang}_${type}_FOUND + FALSE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + + MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS) + +ENDMACRO() + +# CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX") CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2") CHECK_SSE(C +# "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") +# +CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX") +CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2") 
+CHECK_SSE(CXX "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index f092ff6428..94033e8333 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -27,6 +27,17 @@ function(find_and_configure_hnswlib) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) endif () + + include(cmake/modules/FindAVX.cmake) + + set(HNSW_CXX_FLAGS "") + if(CXX_AVX_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX_FLAGS}") + elseif(CXX_AVX2_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX2_FLAGS}") + elseif(CXX_AVX512_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX512_FLAGS}") + endif() endfunction() # Change pinned tag here to test a commit in CI From a7392707157e0ccb322afd9730e32e055363d412 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 28 Mar 2023 11:33:56 -0400 Subject: [PATCH 38/39] Adding copyrights and proper attribution for PyTorch copied cmake file --- .pre-commit-config.yaml | 2 +- build.sh | 8 +-- ci/checks/copyright.py | 3 +- cpp/cmake/modules/FindAVX.cmake | 22 +++++---- thirdparty/LICENSES/LICENSE.pytorch | 77 +++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 thirdparty/LICENSES/LICENSE.pytorch diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 630b8788f8..d6e4ecb676 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -62,7 +62,7 @@ repos: entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] - exclude: .*/thirdparty/.* + exclude: .*/thirdparty/.*|.*FindAVX.cmake.* # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. 
additional_dependencies: diff --git a/build.sh b/build.sh index e30138d79c..3758dc26c4 100755 --- a/build.sh +++ b/build.sh @@ -4,18 +4,18 @@ # raft build scripts -# This scripts is used to build the component(s) in this repo from +# This script is used to build the component(s) in this repo from # source, and can be called with various options to customize the # build as needed (see the help output for details) -# Abort scripts on first error +# Abort script on first error set -e NUMARGS=$# ARGS=$* # NOTE: ensure all dir changes are relative to the location of this -# scripts, and that this scripts resides in the repo dir! +# scripts, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn -h" @@ -59,7 +59,7 @@ RAFT_DASK_BUILD_DIR=${REPODIR}/python/raft-dask/_skbuild PYLIBRAFT_BUILD_DIR=${REPODIR}/python/pylibraft/_skbuild BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PYLIBRAFT_BUILD_DIR} ${RAFT_DASK_BUILD_DIR}" -# Set defaults for vars modified by flags to this scripts +# Set defaults for vars modified by flags to this script CMAKE_LOG_LEVEL="" VERBOSE_FLAG="" BUILD_ALL_GPU_ARCH=0 diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index a44314a6ce..123aeba87b 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -192,7 +192,8 @@ def checkCopyright_main(): action="append", required=False, default=["python/cuml/_thirdparty/", - "cpp/include/raft/thirdparty/"], + "cpp/include/raft/thirdparty/", + "cpp/cmake/modules/FindAVX.cmake"], help=("Exclude the paths specified (regexp). 
" "Can be specified multiple times.")) diff --git a/cpp/cmake/modules/FindAVX.cmake b/cpp/cmake/modules/FindAVX.cmake index db0b3830a3..7f3b2dfc76 100644 --- a/cpp/cmake/modules/FindAVX.cmake +++ b/cpp/cmake/modules/FindAVX.cmake @@ -1,15 +1,17 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) # -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# Note: This file was copied from PyTorch and modified for use in the RAFT library. +# Refer to thirdparty/LICENSES/LICENSE.pytorch for license and additional +# copyright information. 
# ============================================================================= INCLUDE(CheckCXXSourceRuns) diff --git a/thirdparty/LICENSES/LICENSE.pytorch b/thirdparty/LICENSES/LICENSE.pytorch new file mode 100644 index 0000000000..7ad3d737a5 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.pytorch @@ -0,0 +1,77 @@ +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. 
If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file From 9fe55acdce837f8cebc5c2c871394dd74a4ecb92 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 28 Mar 2023 11:37:54 -0400 Subject: [PATCH 39/39] Addressing remaining review feedback --- cpp/bench/ann/conf/deep-100M.json | 1 - cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h | 1 - cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 3 +-- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json index ade590d35c..b3a945d50e 100644 --- a/cpp/bench/ann/conf/deep-100M.json +++ b/cpp/bench/ann/conf/deep-100M.json @@ -4,7 +4,6 @@ "base_file" : "data/deep-1B/base.1B.fbin", "subset_size" : 100000000, "query_file" : "data/deep-1B/query.public.10K.fbin", - // although distance should be "euclidean", faiss becomes much slower for that "distance" : "euclidean" }, diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 2896d992ea..8b2a7d329b 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -84,7 +84,6 @@ class RaftIvfFlatGpu : public ANN { std::optional> index_; int device_; int dimension_; - const int serialization_version = 1; rmm::mr::pool_memory_resource mr_; }; diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 9b0dee6b84..70dff81847 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -84,8 +84,7 @@ class RaftIvfPQ : public ANN { std::optional> index_; int device_; int dimension_; - float refine_ratio_ = 1.0; - const int serialization_version = 1; + float refine_ratio_ = 1.0; rmm::mr::pool_memory_resource mr_; raft::device_matrix_view dataset_; };