CAGRA: Separate graph index sorting functionality from prune function (#1471)

# Changes

This PR separates the graph index sorting functionality from the CAGRA `prune` function and moves it into a new `sort_knn_graph` function. (Related issue: #1446)
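
A minimal sketch of the resulting call sequence (illustration only, not code from this PR; the wrapper name `sort_then_prune`, the `float`/`uint32_t` types, and the host-resident graph views are assumptions): a KNN graph that was not produced by `cagra::build_knn_graph` is sorted first, and `prune` then operates on the sorted graph without needing the dataset.

```cpp
#include <cstdint>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/neighbors/cagra.cuh>

// Sketch only: sort a graph built outside `cagra::build_knn_graph`, then prune it.
// After this PR, `prune` no longer takes the dataset argument.
void sort_then_prune(raft::device_resources const& res,
                     raft::device_matrix_view<const float, uint32_t> dataset,
                     raft::host_matrix_view<uint32_t, uint32_t> knn_graph,
                     raft::host_matrix_view<uint32_t, uint32_t> pruned_graph)
{
  namespace cagra = raft::neighbors::experimental::cagra;
  // New function introduced by this PR: sort each graph row by the neighbors'
  // distance to the row's source vector.
  cagra::sort_knn_graph(res, dataset, knn_graph);
  // Pruning reads only the (sorted) graph and writes the reduced-degree graph.
  cagra::prune(res, knn_graph, pruned_graph);
}
```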

# Unit test

I have added a new unit test for the sorting function. The test uses a dataset separate from the one used in the main CAGRA test, to avoid the effect of rounding errors in the norm computation between two vectors in the dataset. More details are in the source code:
https://github.com/enp1s0/raft/blob/ea6c449c260895e9125a591a4848eed06f5b72c4/cpp/test/neighbors/ann_cagra.cuh#L93-L96
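
For illustration only (this is not the test code from the PR), a hypothetical host-side reference check of the property the sort is expected to establish, assuming squared Euclidean distance as the ordering key: each graph row should list its neighbors in non-decreasing distance to the row's source vector. When two neighbors are nearly equidistant, rounding in the norm computation can make that ordering ambiguous, which is why the test dataset is chosen to avoid such cases.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference check (not the actual test): returns true if every
// graph row lists neighbors in non-decreasing squared Euclidean distance to
// the row's source vector.
bool is_knn_graph_sorted(const std::vector<float>& dataset,   // [n_rows * dim], row-major
                         const std::vector<uint32_t>& graph,  // [n_rows * degree], row-major
                         std::size_t n_rows,
                         std::size_t dim,
                         std::size_t degree)
{
  auto sq_dist = [&](std::size_t a, std::size_t b) {
    double d = 0.0;
    for (std::size_t k = 0; k < dim; k++) {
      const double diff =
        static_cast<double>(dataset[a * dim + k]) - static_cast<double>(dataset[b * dim + k]);
      d += diff * diff;
    }
    return d;
  };
  for (std::size_t row = 0; row < n_rows; row++) {
    for (std::size_t j = 1; j < degree; j++) {
      const double prev = sq_dist(row, graph[row * degree + j - 1]);
      const double curr = sq_dist(row, graph[row * degree + j]);
      if (curr < prev) { return false; }  // out of order
    }
  }
  return true;
}
```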

# Issue
Close #1446

Authors:
  - tsuki (https://github.com/enp1s0)
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #1471
enp1s0 authored May 10, 2023
1 parent 2014112 commit cc4a76b
Showing 6 changed files with 306 additions and 70 deletions.
65 changes: 51 additions & 14 deletions cpp/include/raft/neighbors/cagra.cuh
@@ -52,8 +52,8 @@ namespace raft::neighbors::experimental::cagra {
* @code{.cpp}
* using namespace raft::neighbors;
* // use default index parameters
* ivf_pq::index_params build_params;
* ivf_pq::search_params search_params
* cagra::index_params build_params;
* cagra::search_params search_params
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // create knn graph
* cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
@@ -84,34 +84,71 @@ void build_knn_graph(raft::device_resources const& res,
detail::build_knn_graph(res, dataset, knn_graph, refine_rate, build_params, search_params);
}

/**
* @brief Sort a KNN graph index.
* Preprocessing step for `cagra::prune`: If a KNN graph is not built using
* `cagra::build_knn_graph`, then it is necessary to call this function before calling
* `cagra::prune`. If the graph is built by `cagra::build_knn_graph`, it is already sorted and you
* do not need to call this function.
*
* Usage example:
* @code{.cpp}
* using namespace raft::neighbors;
* cagra::index_params build_params;
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // build KNN graph not using `cagra::build_knn_graph`
* // build(knn_graph, dataset, ...);
* // sort graph index
* sort_knn_graph(res, dataset.view(), knn_graph.view());
* // prune graph
* cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view());
* // Construct an index from dataset and pruned knn_graph
* auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset, pruned_graph.view());
* @endcode
*
* @tparam DataT type of the data in the source dataset
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in,out] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
*/
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
detail::graph::sort_knn_graph(res, dataset, knn_graph);
}

/**
* @brief Prune a KNN graph.
*
* Decrease the number of neighbors for each node.
*
* See [cagra::build_knn_graph](#cagra::build_knn_graph) for usage example
*
* @tparam T data element type
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
* @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree]
*/
template <class DATA_T,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
detail::graph::prune(res, dataset, knn_graph, new_graph);
detail::graph::prune(res, knn_graph, new_graph);
}

/**
@@ -138,11 +175,11 @@ void prune(raft::device_resources const& res,
* // create and fill the index from a [N, D] dataset
* auto index = cagra::build(res, index_params, dataset);
* // use default search parameters
* ivf_pq::search_params search_params;
* cagra::search_params search_params;
* // search K nearest neighbours
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* ivf_pq::search(res, search_params, index, queries, neighbors, distances);
* cagra::search(res, search_params, index, queries, neighbors, distances);
* @endcode
*
* @tparam T data element type
@@ -178,7 +215,7 @@ index<T, IdxT> build(raft::device_resources const& res,

auto cagra_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), params.graph_degree);

prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());

// Construct an index from dataset and pruned knn graph.
return index<T, IdxT>(res, params.metric, dataset, cagra_graph.view());
131 changes: 84 additions & 47 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -405,36 +405,24 @@ void shift_array(T* array, uint64_t num)
}
}

/** Input arrays can be both host and device*/
template <class DATA_T,
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
"dataset size is expected to have the same number of graph index size");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const DataT* dataset_ptr = dataset.data_handle();

RAFT_EXPECTS(
dataset.extent(0) == knn_graph.extent(0) && knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
const DATA_T* dataset_ptr = dataset.data_handle();
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
float scale = 1.0f / raft::spatial::knn::detail::utils::config<DATA_T>::kDivisor;
const std::size_t graph_size = dataset_size;
size_t array_size;
const uint32_t input_graph_degree = knn_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();

// Setup GPUs
int num_gpus = 0;
@@ -451,46 +439,48 @@ void prune(raft::device_resources const& res,
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
const uint32_t graph_size = knn_graph.extent(0);
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
const uint32_t graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

uint32_t dataset_chunk_size = dataset_size;
DATA_T*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
DataT*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
const uint32_t dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
assert(dataset_chunk_size == graph_chunk_size);
d_dataset_ptr = mgpu_alloc<DATA_T>(num_gpus, dataset_chunk_size, dataset_dim);
d_dataset_ptr = mgpu_alloc<DataT>(num_gpus, dataset_chunk_size, dataset_dim);

mgpu_H2D<DATA_T>(
const float scale = 1.0f / raft::spatial::knn::detail::utils::config<DataT>::kDivisor;

mgpu_H2D<DataT>(
d_dataset_ptr, dataset_ptr, num_gpus, dataset_size, dataset_chunk_size, dataset_dim);

//
// Sorting kNN graph
//
double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");
mgpu_H2D<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_H2D<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
void (*kernel_sort)(
DATA_T**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
DataT**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
constexpr int numElementsPerThread = 4;
dim3 threads_sort(1, 1, 1);
if (input_graph_degree <= numElementsPerThread * 32) {
constexpr int blockDim_x = 32;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 64) {
constexpr int blockDim_x = 64;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 128) {
constexpr int blockDim_x = 128;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 256) {
constexpr int blockDim_x = 256;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else {
fprintf(stderr,
@@ -510,21 +500,68 @@ void prune(raft::device_resources const& res,
dataset_dim,
scale,
d_input_graph_ptr[i_gpu],
graph_size,
dataset_size,
graph_chunk_size,
input_graph_degree,
i_gpu);
}
RAFT_CUDA_TRY(cudaSetDevice(0));
RAFT_CUDA_TRY(cudaDeviceSynchronize());
RAFT_LOG_DEBUG(".");
mgpu_D2H<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_D2H<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
RAFT_LOG_DEBUG("\n");
double time_sort_end = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN graph time: %.1lf sec\n", time_sort_end - time_sort_start);

mgpu_free<DATA_T>(d_dataset_ptr, num_gpus);
mgpu_free<DataT>(d_dataset_ptr, num_gpus);
}

/** Input arrays can be both host and device*/
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
const std::size_t graph_size = new_graph.extent(0);
size_t array_size;

// Setup GPUs
int num_gpus = 0;

// Setup GPUs
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus));
RAFT_LOG_DEBUG("# num_gpus: %d\n", num_gpus);
for (int self = 0; self < num_gpus; self++) {
RAFT_CUDA_TRY(cudaSetDevice(self));
for (int peer = 0; peer < num_gpus; peer++) {
if (self == peer) { continue; }
RAFT_CUDA_TRY(cudaDeviceEnablePeerAccess(peer, 0));
}
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

//
uint8_t* detour_count; // [graph_size, input_graph_degree]
