Remove duplicate adj_to_csr implementation #4829

Merged
107 changes: 3 additions & 104 deletions cpp/src/dbscan/adjgraph/algo.cuh
@@ -24,95 +24,15 @@
#include "pack.h"

#include <raft/cuda_utils.cuh>
- #include <raft/device_atomics.cuh>
#include <raft/handle.hpp>
- #include <raft/vectorized.cuh>
+ #include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace ML {
namespace Dbscan {
namespace AdjGraph {
namespace Algo {

- /**
-  * @brief Convert a boolean adjacency matrix into CSR format.
-  *
-  * The adj_to_csr kernel converts a boolean adjacency matrix into CSR format.
-  * High performance comes at the cost of non-deterministic output: the column
-  * indices are not guaranteed to be stored in order.
-  *
-  * The kernel has been optimized to handle matrices that are non-square, for
-  * instance subsets of a full adjacency matrix. In practice, these matrices can
-  * be very wide and not very tall. In principle, each row is assigned to one
-  * block. If there are more SMs than rows, multiple blocks operate on a single
-  * row. To enable cooperation between these blocks, each row is provided a
-  * counter where the current output index can be cooperatively (atomically)
-  * incremented. As a result, the order of the output indices is not guaranteed
-  * to be in order.
-  *
-  * @param[in] adj: a num_rows x num_cols boolean matrix in contiguous row-major
-  *                 format.
-  *
-  * @param[in] row_ind: an array of length num_rows that indicates at which index
-  *                     a row starts in out_col_ind. Equivalently, it is the
-  *                     exclusive scan of the number of non-zeros in each row of
-  *                     `adj`.
-  *
-  * @param[in] num_rows: number of rows of adj.
-  * @param[in] num_cols: number of columns of adj.
-  *
-  * @param[in,out] row_counters: a temporary zero-initialized array of length num_rows.
-  *
-  * @param[out] out_col_ind: an array containing the column indices of the
-  *                          non-zero values in `adj`. Size should be at least
-  *                          the number of non-zeros in `adj`.
-  */
- template <typename index_t>
- __global__ void adj_to_csr(const bool* adj,         // row-major adjacency matrix
-                            const index_t* row_ind,  // precomputed row indices
-                            index_t num_rows,        // # rows of adj
-                            index_t num_cols,        // # cols of adj
-                            index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-                            index_t* out_col_ind     // output column indices
- )
- {
-   typedef raft::TxN_t<bool, 16> bool16;
-
-   for (index_t i = blockIdx.y; i < num_rows; i += gridDim.y) {
-     // Load row information
-     index_t row_base   = row_ind[i];
-     index_t* row_count = row_counters + i;
-     const bool* row    = adj + i * num_cols;
-
-     // Peeling: process the first j0 elements that are not aligned to a 16-byte
-     // boundary.
-     index_t j0 = (16 - (((uintptr_t)(const void*)row) % 16)) % 16;
-     j0         = min(j0, num_cols);
-     if (threadIdx.x < j0 && blockIdx.x == 0) {
-       if (row[threadIdx.x]) { out_col_ind[row_base + atomicIncWarp(row_count)] = threadIdx.x; }
-     }
-
-     // Process the rest of the row in 16 byte chunks starting at j0.
-     // This is a grid-stride loop.
-     index_t j = j0 + 16 * (blockIdx.x * blockDim.x + threadIdx.x);
-     for (; j + 15 < num_cols; j += 16 * (blockDim.x * gridDim.x)) {
-       bool16 chunk;
-       chunk.load(row, j);
-
-       for (int k = 0; k < 16; ++k) {
-         if (chunk.val.data[k]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j + k; }
-       }
-     }
-
-     // Remainder: process the last j1 bools in the row individually.
-     index_t j1 = (num_cols - j0) % 16;
-     if (threadIdx.x < j1 && blockIdx.x == 0) {
-       int j = num_cols - j1 + threadIdx.x;
-       if (row[j]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j; }
-     }
-   }
- }

/**
* @brief Converts a boolean adjacency matrix into CSR format.
*
@@ -144,29 +64,8 @@ void launcher(const raft::handle_t& handle,
  device_ptr<Index_> dev_ex_scan = device_pointer_cast(data.ex_scan);
  thrust::exclusive_scan(handle.get_thrust_policy(), dev_vd, dev_vd + batch_size, dev_ex_scan);

-   // Zero-fill a temporary vector that can be used by the adj_to_csr kernel to
-   // keep track of the number of entries added to a row.
-   RAFT_CUDA_TRY(cudaMemsetAsync(row_counters, 0, batch_size * sizeof(Index_), stream));
-
-   // Split the grid in the row direction (since each row can be processed
-   // independently). If the maximum number of active blocks (num_sms *
-   // occupancy) exceeds the number of rows, assign multiple blocks to a single
-   // row.
-   int threads_per_block = 1024;
-   int dev_id, sm_count, blocks_per_sm;
-   cudaGetDevice(&dev_id);
-   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-     &blocks_per_sm, adj_to_csr<Index_>, threads_per_block, 0);
-
-   Index_ max_active_blocks = sm_count * blocks_per_sm;
-   Index_ blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
-   Index_ grid_rows         = raft::ceildiv(max_active_blocks, blocks_per_row);
-   dim3 block(threads_per_block, 1);
-   dim3 grid(blocks_per_row, grid_rows);
-
-   adj_to_csr<Index_><<<grid, block, 0, stream>>>(
-     adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
+   raft::sparse::convert::adj_to_csr(
+     handle, adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

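For reviewers unfamiliar with this conversion: below is a serial CPU reference (illustrative only, not part of this PR, cuML, or RAFT) of what adj_to_csr computes. It also shows why the GPU output's within-row order is unspecified: the kernel reserves output slots through a per-row atomic counter rather than scanning columns in order, so only the set of indices per row is deterministic.

```cpp
// Serial reference for the adjacency -> CSR conversion; hypothetical helper.
#include <cstdio>
#include <vector>

std::vector<int> adj_to_csr_reference(const std::vector<bool>& adj,
                                      int num_rows,
                                      int num_cols,
                                      std::vector<int>& row_ind)  // out: size num_rows
{
  // row_ind is the exclusive scan of per-row non-zero counts.
  row_ind.assign(num_rows, 0);
  int nnz = 0;
  for (int i = 0; i < num_rows; ++i) {
    row_ind[i] = nnz;  // offset where row i starts in the output
    for (int j = 0; j < num_cols; ++j) {
      nnz += adj[i * num_cols + j] ? 1 : 0;
    }
  }
  std::vector<int> out_col_ind(nnz);
  std::vector<int> row_counters(num_rows, 0);  // mirrors the kernel's atomics
  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < num_cols; ++j) {
      if (adj[i * num_cols + j]) { out_col_ind[row_ind[i] + row_counters[i]++] = j; }
    }
  }
  return out_col_ind;
}

int main()
{
  // 2 x 4 adjacency matrix: row 0 -> {1, 3}, row 1 -> {0}.
  std::vector<bool> adj = {false, true, false, true, true, false, false, false};
  std::vector<int> row_ind;
  auto col_ind = adj_to_csr_reference(adj, 2, 4, row_ind);
  for (int c : col_ind) std::printf("%d ", c);  // prints: 1 3 0
  std::printf("\n");
}
```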
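The deleted kernel's peel/vectorize/remainder structure hinges on one line of pointer arithmetic. A small host-side illustration of that arithmetic, using hypothetical addresses:

```cpp
// j0 is how many leading bools must be handled one by one before aligned
// 16-byte vectorized loads (raft::TxN_t<bool, 16>) can start.
#include <cstdint>
#include <cstdio>

int main()
{
  // Only the row address modulo 16 matters.
  std::uintptr_t addrs[] = {0x1000, 0x1001, 0x100f, 0x1008};
  for (std::uintptr_t a : addrs) {
    int j0 = (16 - (a % 16)) % 16;  // yields 0, 15, 1, 8
    std::printf("addr %% 16 = %2d -> peel j0 = %2d bools\n", int(a % 16), j0);
  }
  // After the peel, the main loop reads aligned 16-bool chunks; a final
  // remainder of (num_cols - j0) % 16 bools is again handled individually.
}
```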
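The deleted launcher's grid split is easiest to follow with concrete numbers. A host-only sketch with hypothetical device figures (sm_count and blocks_per_sm vary by GPU and kernel; they are not taken from this PR):

```cpp
// Worked example of the occupancy-based grid sizing the launcher used.
#include <cstdio>

int ceildiv(int a, int b) { return (a + b - 1) / b; }  // same role as raft::ceildiv

int main()
{
  int sm_count      = 108;  // hypothetical; query with cudaDeviceGetAttribute
  int blocks_per_sm = 2;    // from cudaOccupancyMaxActiveBlocksPerMultiprocessor
  int num_rows      = 50;   // a batch that is much wider than it is tall

  int max_active_blocks = sm_count * blocks_per_sm;                    // 216
  int blocks_per_row    = ceildiv(max_active_blocks, num_rows);        // 5
  int grid_rows         = ceildiv(max_active_blocks, blocks_per_row);  // 44

  // grid = dim3(blocks_per_row, grid_rows): blockIdx.x splits one row across
  // cooperating blocks; blockIdx.y strides over rows (i += gridDim.y).
  std::printf("grid = (%d, %d), %d blocks total\n",
              blocks_per_row, grid_rows, blocks_per_row * grid_rows);  // (5, 44), 220
}
```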
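Finally, a minimal sketch of the consolidated call path after this PR, assuming only the header and argument order visible in this diff; the wrapper name convert_adj and its parameter names are illustrative, not cuML API:

```cpp
#include <raft/cudart_utils.h>  // for RAFT_CUDA_TRY; location may vary by RAFT version
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>

// row_ind is the exclusive scan of per-row non-zero counts, and tmp is a
// scratch array of num_rows counters, as in the deleted launcher.
template <typename index_t>
void convert_adj(const raft::handle_t& handle,
                 const bool* adj,         // device, num_rows x num_cols, row-major
                 const index_t* row_ind,  // device, exclusive scan of row nnz
                 index_t num_rows,
                 index_t num_cols,
                 index_t* tmp,            // device scratch, length num_rows
                 index_t* out_col_ind)    // device, length = total nnz
{
  raft::sparse::convert::adj_to_csr(
    handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}
```

Moving the zero-fill, occupancy query, and grid sizing behind this one RAFT entry point is what lets the PR drop 104 lines here while keeping behavior identical.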