Add launch bounds to adj_to_csr_kernel

Also: reduce threads per block from 1024 to 512. This improves performance slightly on upcoming compute architectures.
rapidsai · Aug 9, 2022 · c546ff5 · c546ff5
1 parent 8ab260e
commit c546ff5
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 12 deletions.
diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <cooperative_groups.h>
-#include <cooperative_groups/scan.h>
 #include <stdio.h>
 
 #include <common/benchmark.hpp>

diff --git a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
@@ -29,6 +29,9 @@ namespace sparse {
 namespace convert {
 namespace detail {
 
+// Threads per block in adj_to_csr_kernel.
+const int adj_to_csr_tpb = 512;
+
 /**
  * @brief Convert dense adjacency matrix into unsorted CSR format.
  *
@@ -58,13 +61,14 @@ namespace detail {
  *                             the number of non-zeros in `adj`.
  */
 template <typename index_t>
-__global__ void adj_to_csr_kernel(const bool* adj,         // row-major adjacency matrix
-                                  const index_t* row_ind,  // precomputed row indices
-                                  index_t num_rows,        // # rows of adj
-                                  index_t num_cols,        // # cols of adj
-                                  index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-                                  index_t* out_col_ind     // output column indices
-)
+__global__ void __launch_bounds__(adj_to_csr_tpb)
+  adj_to_csr_kernel(const bool* adj,         // row-major adjacency matrix
+                    const index_t* row_ind,  // precomputed row indices
+                    index_t num_rows,        // # rows of adj
+                    index_t num_cols,        // # cols of adj
+                    index_t* row_counters,   // pre-allocated (zeroed) atomic counters
+                    index_t* out_col_ind     // output column indices
+  )
 {
   const int chunk_size = 16;
   typedef raft::TxN_t<bool, chunk_size> chunk_bool;
@@ -148,17 +152,16 @@ void adj_to_csr(const raft::handle_t& handle,
   // independently). If the maximum number of active blocks (num_sms *
   // occupancy) exceeds the number of rows, assign multiple blocks to a single
   // row.
-  int threads_per_block = 1024;
   int dev_id, sm_count, blocks_per_sm;
   cudaGetDevice(&dev_id);
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, adj_to_csr_kernel<index_t>, threads_per_block, 0);
+    &blocks_per_sm, adj_to_csr_kernel<index_t>, adj_to_csr_tpb, 0);
 
   index_t max_active_blocks = sm_count * blocks_per_sm;
   index_t blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
   index_t grid_rows         = raft::ceildiv(max_active_blocks, blocks_per_row);
-  dim3 block(threads_per_block, 1);
+  dim3 block(adj_to_csr_tpb, 1);
   dim3 grid(blocks_per_row, grid_rows);
 
   adj_to_csr_kernel<index_t>