From 07bdc85029d59bfba1998eda9df65ec2e0b835fe Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 20 Jul 2022 11:42:01 +0200
Subject: [PATCH] sparse: Implement review feedback

- Rename dense_bool_to_unsorted_csr to adj_to_csr
- Add grid-stride loops for test case generation (both bench and test)
- Remove overload

In addition:
- Add test case for empty input
- Fix behavior in case of empty input (return early)
---
 cpp/bench/sparse/convert_csr.cu               | 55 ++++++++------
 cpp/include/raft/sparse/convert/csr.cuh       | 51 +++----------
 .../{dense_to_csr.cuh => adj_to_csr.cuh}      | 72 +++++--------------
 cpp/test/sparse/convert_csr.cu                | 42 +++++++----
 4 files changed, 89 insertions(+), 131 deletions(-)
 rename cpp/include/raft/sparse/convert/detail/{dense_to_csr.cuh => adj_to_csr.cuh} (69%)

diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu
index dfcdc1ff90..0e701518ab 100644
--- a/cpp/bench/sparse/convert_csr.cu
+++ b/cpp/bench/sparse/convert_csr.cu
@@ -24,36 +24,50 @@

 namespace raft::bench::sparse {

+template <typename index_t>
 struct bench_param {
-  size_t num_cols;
-  size_t num_rows;
-  size_t divisor;
+  index_t num_cols;
+  index_t num_rows;
+  index_t divisor;
 };

 template <typename index_t>
-__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
+__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;

-  if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
+  for (; r < num_rows; r += gridDim.y * blockDim.y) {
+    for (index_t cc = c; cc < num_cols; cc += gridDim.x * blockDim.x) {  // restart column sweep per row
+      adj[r * num_cols + cc] = cc % divisor == 0;
+    }
+  }
+}
+
+template <typename index_t>
+void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
+{
+  // adj matrix: element a_ij is set to one if j is divisible by divisor.
+  dim3 block(32, 32);
+  const index_t max_y_grid_dim = 65535;
+  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
+  init_adj_kernel<<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
+  RAFT_CHECK_CUDA(stream);
 }

 template <typename index_t>
 struct bench_base : public fixture {
-  bench_base(const bench_param& p)
+  bench_base(const bench_param<index_t>& p)
     : params(p),
       handle(stream),
       adj(p.num_rows * p.num_cols, stream),
       row_ind(p.num_rows, stream),
       row_ind_host(p.num_rows),
       row_counters(p.num_rows, stream),
-      col_ind(p.num_rows * p.num_cols,
-              stream)  // This is over-dimensioned because nnz is unknown at this point
+      // col_ind is over-dimensioned because nnz is unknown at this point
+      col_ind(p.num_rows * p.num_cols, stream)
   {
-    dim3 block(32, 32);
-    dim3 grid(p.num_cols / 32 + 1, p.num_rows / 32 + 1);
-    init_adj<<<grid, block, 0, stream>>>(adj.data(), p.num_rows, p.num_cols, p.divisor);
+    init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

     std::vector<index_t> row_ind_host(p.num_rows);
     for (size_t i = 0; i < row_ind_host.size(); ++i) {
@@ -66,13 +80,13 @@ struct bench_base : public fixture {
   void run_benchmark(::benchmark::State& state) override
   {
     loop_on_state(state, [this]() {
-      raft::sparse::convert::dense_bool_to_unsorted_csr(handle,
-                                                        adj.data(),
-                                                        row_ind.data(),
-                                                        params.num_rows,
-                                                        params.num_cols,
-                                                        row_counters.data(),
-                                                        col_ind.data());
+      raft::sparse::convert::adj_to_csr(handle,
+                                        adj.data(),
+                                        row_ind.data(),
+                                        params.num_rows,
+                                        params.num_cols,
+                                        row_counters.data(),
+                                        col_ind.data());
     });

     // Estimate bandwidth:
@@ -96,7 +110,7 @@ struct bench_base : public fixture {

  protected:
   raft::handle_t handle;
-  bench_param params;
+  bench_param<index_t> params;
   rmm::device_uvector<bool> adj;
   rmm::device_uvector<index_t> row_ind;
   std::vector<index_t> row_ind_host;
@@ -106,7 +120,7 @@

 const int64_t num_cols = 1 << 30;

-const std::vector<bench_param> bench_params = {
+const std::vector<bench_param<int64_t>> bench_params = {
   {num_cols, 1, 8},
   {num_cols >> 3, 1 << 3, 8},
   {num_cols >> 6, 1 << 6, 8},
@@ -121,6 +135,5 @@ const std::vector<bench_param> bench_params = {
 };

 RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);
-// RAFT_BENCH_REGISTER(bench_base, "", bench_params);

 }  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 048e6469de..abdacdc426 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -18,8 +18,8 @@

 #pragma once

+#include <raft/sparse/convert/detail/adj_to_csr.cuh>
 #include <raft/sparse/convert/detail/csr.cuh>
-#include <raft/sparse/convert/detail/dense_to_csr.cuh>
 #include <raft/sparse/csr.hpp>

 namespace raft {
@@ -90,49 +90,16 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
 *                         number of non-zeros in adj.
 */
 template <typename index_t>
-void dense_bool_to_unsorted_csr(
-  const raft::handle_t& handle,
-  const bool* adj,         // Row-major adjacency matrix
-  const index_t* row_ind,  // Precomputed row indices
-  index_t num_rows,        // # rows of adj
-  index_t num_cols,        // # cols of adj
-  index_t* tmp,  // Pre-allocated atomic counters. Minimum size: num_rows elements.
-  index_t* out_col_ind     // Output column indices
+void adj_to_csr(const raft::handle_t& handle,
+                const bool* adj,         // Row-major adjacency matrix
+                const index_t* row_ind,  // Precomputed row indices
+                index_t num_rows,        // # rows of adj
+                index_t num_cols,        // # cols of adj
+                index_t* tmp,  // Pre-allocated atomic counters. Minimum size: num_rows elements.
+                index_t* out_col_ind     // Output column indices
 )
 {
-  detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
-}
-
-/**
- * @brief Converts a boolean adjacency matrix into unsorted CSR format.
- *
- * The conversion supports non-square matrices.
- *
- * @tparam index_t Indexing arithmetic type
- *
- * @param[in]  handle      RAFT handle
- * @param[in]  adj         A num_rows x num_cols boolean matrix in contiguous row-major
- *                         format.
- * @param[in]  row_ind     An array of length num_rows that indicates at which index
- *                         a row starts in out_col_ind. Equivalently, it is the
- *                         exclusive scan of the number of non-zeros in each row of
- *                         adj.
- * @param[in]  num_rows    Number of rows of adj.
- * @param[in]  num_cols    Number of columns of adj.
- * @param[out] out_col_ind An array containing the column indices of the
- *                         non-zero values in adj. Size should be at least the
- *                         number of non-zeros in adj.
- */
-template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* out_col_ind     // output column indices
-)
-{
-  detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, out_col_ind);
+  detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
 }

 };  // end NAMESPACE convert
diff --git a/cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
similarity index 69%
rename from cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh
rename to cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
index 6d28f486f1..16ed8c8c90 100644
--- a/cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
@@ -42,8 +42,7 @@ namespace detail {
 * block. If there are more SMs than rows, multiple blocks operate on a single
 * row. To enable cooperation between these blocks, each row is provided a
 * counter where the current output index can be cooperatively (atomically)
- * incremented. As a result, the order of the output indices is not guaranteed
- * to be in order.
+ * incremented. As a result, the order of the output indices is not guaranteed.
 *
 * @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
 *                format.
@@ -59,13 +58,12 @@ namespace detail {
 *                the number of non-zeros in `adj`.
 */
 template <typename index_t>
-__global__ void dense_bool_to_unsorted_csr_kernel(
-  const bool* adj,         // row-major adjacency matrix
-  const index_t* row_ind,  // precomputed row indices
-  index_t num_rows,        // # rows of adj
-  index_t num_cols,        // # cols of adj
-  index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-  index_t* out_col_ind     // output column indices
+__global__ void adj_to_csr_kernel(const bool* adj,         // row-major adjacency matrix
+                                  const index_t* row_ind,  // precomputed row indices
+                                  index_t num_rows,        // # rows of adj
+                                  index_t num_cols,        // # cols of adj
+                                  index_t* row_counters,   // pre-allocated (zeroed) atomic counters
+                                  index_t* out_col_ind     // output column indices
 )
 {
   const int chunk_size = 16;
@@ -127,18 +125,20 @@ __global__ void dense_bool_to_unsorted_csr_kernel(
 *                         number of non-zeros in adj.
 */
 template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* tmp,            // pre-allocated atomic counters
-                                index_t* out_col_ind     // output column indices
+void adj_to_csr(const raft::handle_t& handle,
+                const bool* adj,         // row-major adjacency matrix
+                const index_t* row_ind,  // precomputed row indices
+                index_t num_rows,        // # rows of adj
+                index_t num_cols,        // # cols of adj
+                index_t* tmp,            // pre-allocated atomic counters
+                index_t* out_col_ind     // output column indices
 )
 {
   auto stream = handle.get_stream();

-  RAFT_EXPECTS(tmp != nullptr, "dense_bool_to_unsorted_csr: tmp workspace may not be null.");
+  // Check inputs and return early if possible.
+  if (num_rows == 0 || num_cols == 0) { return; }
+  RAFT_EXPECTS(tmp != nullptr, "adj_to_csr: tmp workspace may not be null.");

   // Zero-fill a temporary vector that is used by the kernel to keep track of
   // the number of entries added to a row.
@@ -153,7 +153,7 @@
   cudaGetDevice(&dev_id);
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, dense_bool_to_unsorted_csr_kernel<index_t>, threads_per_block, 0);
+    &blocks_per_sm, adj_to_csr_kernel<index_t>, threads_per_block, 0);

   index_t max_active_blocks = sm_count * blocks_per_sm;
   index_t blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
@@ -161,45 +161,11 @@
   dim3 block(threads_per_block, 1);
   dim3 grid(blocks_per_row, grid_rows);

-  dense_bool_to_unsorted_csr_kernel<index_t>
+  adj_to_csr_kernel<index_t>
     <<<grid, block, 0, stream>>>(adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }

-/**
- * @brief Converts a boolean adjacency matrix into unsorted CSR format.
- *
- * The conversion supports non-square matrices.
- *
- * @tparam index_t Indexing arithmetic type
- *
- * @param[in]  handle      RAFT handle
- * @param[in]  adj         A num_rows x num_cols boolean matrix in contiguous row-major
- *                         format.
- * @param[in]  row_ind     An array of length num_rows that indicates at which index
- *                         a row starts in out_col_ind. Equivalently, it is the
- *                         exclusive scan of the number of non-zeros in each row of
- *                         adj.
- * @param[in]  num_rows    Number of rows of adj.
- * @param[in]  num_cols    Number of columns of adj.
- * @param[out] out_col_ind An array containing the column indices of the
- *                         non-zero values in adj. Size should be at least the
- *                         number of non-zeros in adj.
- */
-template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* out_col_ind     // output column indices
-)
-{
-  auto stream = handle.get_stream();
-  rmm::device_uvector<index_t> tmp(num_rows, stream);
-  dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp.data(), out_col_ind);
-}
-
 };  // end NAMESPACE detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 4731fefa83..a217a90e19 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -89,12 +89,27 @@ INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesI
 /******************************** adj graph ********************************/

 template <typename index_t>
-__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
+__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;

-  if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
+  for (; r < num_rows; r += gridDim.y * blockDim.y) {
+    for (index_t cc = c; cc < num_cols; cc += gridDim.x * blockDim.x) {  // restart column sweep per row
+      adj[r * num_cols + cc] = cc % divisor == 0;
+    }
+  }
+}
+
+template <typename index_t>
+void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
+{
+  // adj matrix: element a_ij is set to one if j is divisible by divisor.
+  dim3 block(32, 32);
+  const index_t max_y_grid_dim = 65535;
+  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
+  init_adj_kernel<<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
+  RAFT_CHECK_CUDA(stream);
 }

 template <typename index_t>
@@ -123,11 +138,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
-    init_adj<index_t>
-      <<<grid, block, 0, stream>>>(adj.data(), params.n_rows, params.n_cols, params.divisor);
-
+    init_adj(adj.data(), params.n_rows, params.n_cols, params.divisor, stream);
     // Initialize row_ind
     for (size_t i = 0; i < row_ind_host.size(); ++i) {
       size_t nnz_per_row = raft::ceildiv(params.n_cols, params.divisor);
@@ -141,13 +152,13 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
-    convert::dense_bool_to_unsorted_csr(handle,
-                                        adj.data(),
-                                        row_ind.data(),
-                                        params.n_rows,
-                                        params.n_cols,
-                                        row_counters.data(),
-                                        col_ind.data());
+    convert::adj_to_csr(handle,
+                        adj.data(),
+                        row_ind.data(),
+                        params.n_rows,
+                        params.n_cols,
+                        row_counters.data(),
+                        col_ind.data());

     std::vector<index_t> col_ind_host(col_ind.size());
     raft::update_host(col_ind_host.data(), col_ind.data(), col_ind.size(), stream);
@@ -158,7 +169,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
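
Usage sketch (illustrative, not part of the patch): the snippet below shows how the renamed raft::sparse::convert::adj_to_csr entry point might be called after this change. Only the adj_to_csr signature and its documented preconditions (row_ind is the exclusive scan of per-row non-zero counts; the counter buffer holds at least num_rows elements) come from the patch above; the wrapper name convert_adj_example and the include paths are assumptions made for the example.

#include <raft/handle.hpp>              // assumed location of raft::handle_t
#include <raft/sparse/convert/csr.cuh>  // public header touched by this patch

// Hypothetical helper: converts a row-major boolean adjacency matrix (all
// pointers refer to device memory) into unsorted-CSR column indices.
template <typename index_t>
void convert_adj_example(const raft::handle_t& handle,
                         const bool* adj,         // num_rows x num_cols, row-major
                         const index_t* row_ind,  // exclusive scan of per-row nnz
                         index_t num_rows,
                         index_t num_cols,
                         index_t* row_counters,   // >= num_rows scratch counters
                         index_t* out_col_ind)    // >= nnz output column indices
{
  // Column indices within each row come back unsorted; adj_to_csr zero-fills
  // row_counters and uses them as per-row atomic cursors. With num_rows == 0
  // or num_cols == 0 the call returns early (behavior added by this patch).
  raft::sparse::convert::adj_to_csr(
    handle, adj, row_ind, num_rows, num_cols, row_counters, out_col_ind);
}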