From ffab8f6eb0fb60a8d09b40d0f31debc3b7be312b Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Fri, 6 Oct 2023 11:05:33 -0400
Subject: [PATCH] Fix NN Descent overflows (#1875)

NN-Descent was using the `int` type for indexing in `mdarray`; however, this caused an overflow when the product of all extents exceeded the maximum value representable by `int`.

This PR also adds/fixes:

- Missing dependencies for `raft-ann-bench` development environment
- Exposes the number of NN Descent iterations (`nn_descent_niter`) for use in CAGRA benchmarks

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/1875
---
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  4 +++
 cpp/bench/ann/src/raft/raft_benchmark.cu      |  1 +
 cpp/include/raft/neighbors/cagra.cuh          |  1 +
 cpp/include/raft/neighbors/cagra_types.hpp    |  2 ++
 .../raft/neighbors/detail/nn_descent.cuh      | 36 +++++++++----------
 dependencies.yaml                             |  2 ++
 docs/source/ann_benchmarks_param_tuning.md    |  1 +
 7 files changed, 29 insertions(+), 18 deletions(-)
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 4f1df12dfa..2527fdd1fc 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -31,10 +31,14 @@ dependencies:
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
 - libfaiss>=1.7.1
+- matplotlib
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-64=11.8
+- pandas
+- pyyaml
+- rmm==23.12.*
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index a9ff6c2922..2f005566a9 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -154,6 +154,7 @@ void parse_build_param(const nlohmann::json& conf,
       param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT;
     }
   }
+  if (conf.contains("nn_descent_niter")) { param.nn_descent_niter = conf.at("nn_descent_niter"); }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index f9682a973f..1efb4da95e 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -318,6 +318,7 @@ index<T, IdxT> build(raft::resources const& res,
     auto nn_descent_params                      = experimental::nn_descent::index_params();
     nn_descent_params.graph_degree              = intermediate_degree;
     nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree;
+    nn_descent_params.max_iterations            = params.nn_descent_niter;
     build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), nn_descent_params);
   }
 
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 5061d6082d..4db08110b9 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -58,6 +58,8 @@ struct index_params : ann::index_params {
   size_t graph_degree = 64;
   /** ANN algorithm to build knn graph. */
   graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+  /** Number of iterations to run if building with NN_DESCENT. */
+  size_t nn_descent_niter = 20;
 };
 
 enum class search_algo {
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
index 1fb568a934..ce77cdc3de 100644
--- a/cpp/include/raft/neighbors/detail/nn_descent.cuh
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -362,28 +362,28 @@ class GNND {
   GnndGraph<Index_t> graph_;
   std::atomic<int64_t> update_counter_;
 
-  Index_t nrow_;
-  const int ndim_;
+  size_t nrow_;
+  size_t ndim_;
 
-  raft::device_matrix<__half, Index_t, raft::row_major> d_data_;
-  raft::device_vector<DistData_t, Index_t> l2_norms_;
+  raft::device_matrix<__half, size_t, raft::row_major> d_data_;
+  raft::device_vector<DistData_t, size_t> l2_norms_;
 
-  raft::device_matrix<ID_t, Index_t, raft::row_major> graph_buffer_;
-  raft::device_matrix<DistData_t, Index_t, raft::row_major> dists_buffer_;
+  raft::device_matrix<ID_t, size_t, raft::row_major> graph_buffer_;
+  raft::device_matrix<DistData_t, size_t, raft::row_major> dists_buffer_;
 
   // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827
   thrust::host_vector<ID_t, pinned_memory_allocator<ID_t>> graph_host_buffer_;
   thrust::host_vector<DistData_t, pinned_memory_allocator<DistData_t>> dists_host_buffer_;
 
-  raft::device_vector<int, Index_t> d_locks_;
+  raft::device_vector<int, size_t> d_locks_;
 
   thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_new_;
   thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old_;
   thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_old_;
   // int2.x is the number of forward edges, int2.y is the number of reverse edges
 
-  raft::device_vector<int2, Index_t> d_list_sizes_new_;
-  raft::device_vector<int2, Index_t> d_list_sizes_old_;
+  raft::device_vector<int2, size_t> d_list_sizes_new_;
+  raft::device_vector<int2, size_t> d_list_sizes_old_;
 };
 
 constexpr int TILE_ROW_WIDTH = 64;
@@ -1143,21 +1143,21 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
            NUM_SAMPLES),
     nrow_(build_config.max_dataset_size),
     ndim_(build_config.dataset_dim),
-    d_data_{raft::make_device_matrix<__half, Index_t, raft::row_major>(
+    d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>(
       res, nrow_, build_config.dataset_dim)},
-    l2_norms_{raft::make_device_vector<DistData_t, Index_t>(res, nrow_)},
+    l2_norms_{raft::make_device_vector<DistData_t, size_t>(res, nrow_)},
     graph_buffer_{
-      raft::make_device_matrix<ID_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+      raft::make_device_matrix<ID_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
     dists_buffer_{
-      raft::make_device_matrix<DistData_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+      raft::make_device_matrix<DistData_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
     graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE),
     dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE),
-    d_locks_{raft::make_device_vector<int, Index_t>(res, nrow_)},
+    d_locks_{raft::make_device_vector<int, size_t>(res, nrow_)},
     h_rev_graph_new_(nrow_ * NUM_SAMPLES),
     h_graph_old_(nrow_ * NUM_SAMPLES),
     h_rev_graph_old_(nrow_ * NUM_SAMPLES),
-    d_list_sizes_new_{raft::make_device_vector<int2, Index_t>(res, nrow_)},
-    d_list_sizes_old_{raft::make_device_vector<int2, Index_t>(res, nrow_)}
+    d_list_sizes_new_{raft::make_device_vector<int2, size_t>(res, nrow_)},
+    d_list_sizes_old_{raft::make_device_vector<int2, size_t>(res, nrow_)}
 {
   static_assert(NUM_SAMPLES <= 32);
 
@@ -1342,8 +1342,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
   for (size_t i = 0; i < (size_t)nrow_; i++) {
     for (size_t j = 0; j < build_config_.node_degree; j++) {
       size_t idx = i * graph_.node_degree + j;
-      Index_t id = graph_.h_graph[idx].id();
-      if (id < nrow_) {
+      int id     = graph_.h_graph[idx].id();
+      if (id < static_cast<int>(nrow_)) {
         graph_shrink_buffer[i * build_config_.node_degree + j] = id;
       } else {
         graph_shrink_buffer[i * build_config_.node_degree + j] =
diff --git a/dependencies.yaml b/dependencies.yaml
index fe4a4620e0..e3fd7d0679 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -29,6 +29,7 @@ files:
       - develop
       - cudatoolkit
       - nn_bench
+      - nn_bench_python
   test_cpp:
     output: none
     includes:
@@ -228,6 +229,7 @@ dependencies:
           - libfaiss>=1.7.1
           - benchmark>=1.8.2
           - faiss-proc=*=cuda
+          - *rmm_conda
   nn_bench_python:
     common:
       - output_types: [conda]
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index 433df2ae2f..e79ef36932 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -49,6 +49,7 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g
 | `graph_degree`              | `build_param`  | N        | Positive Integer >0        | 64 | Degree of the final kNN graph index. |
 | `intermediate_graph_degree` | `build_param`  | N        | Positive Integer >0        | 128 | Degree of the intermediate kNN graph. |
 | `graph_build_algo`          | `build_param`  | N | ["IVF_PQ", "NN_DESCENT"]   | "IVF_PQ" | Algorithm to use for search |
+| `nn_descent_niter`          | `build_param`  | N        | Positive Integer >0        | 20 | Number of iterations if using NN_DESCENT. |
 | `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside?                                                                                                                                       |
 | `query_memory_type`         | `search_params` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? |
 | `itopk`                     | `search_wdith`  | N        | Positive Integer >0        | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |