From c546ff5aead21f9631724d513948a88110de91c9 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 9 Aug 2022 14:00:22 +0200 Subject: [PATCH] Add launch bounds to adj_to_csr_kernel Also: reduce threads per block from 1024 to 512. This improves performance slightly on upcoming compute architectures. --- cpp/bench/sparse/convert_csr.cu | 2 -- .../raft/sparse/convert/detail/adj_to_csr.cuh | 23 +++++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu index 0e701518ab..830fab13cc 100644 --- a/cpp/bench/sparse/convert_csr.cu +++ b/cpp/bench/sparse/convert_csr.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include -#include #include #include diff --git a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh index e55627c936..337443393a 100644 --- a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh @@ -29,6 +29,9 @@ namespace sparse { namespace convert { namespace detail { +// Threads per block in adj_to_csr_kernel. +const int adj_to_csr_tpb = 512; + /** * @brief Convert dense adjacency matrix into unsorted CSR format. * @@ -58,13 +61,14 @@ namespace detail { * the number of non-zeros in `adj`. */ template -__global__ void adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix - const index_t* row_ind, // precomputed row indices - index_t num_rows, // # rows of adj - index_t num_cols, // # cols of adj - index_t* row_counters, // pre-allocated (zeroed) atomic counters - index_t* out_col_ind // output column indices -) +__global__ void __launch_bounds__(adj_to_csr_tpb) + adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix + const index_t* row_ind, // precomputed row indices + index_t num_rows, // # rows of adj + index_t num_cols, // # cols of adj + index_t* row_counters, // pre-allocated (zeroed) atomic counters + index_t* out_col_ind // output column indices + ) { const int chunk_size = 16; typedef raft::TxN_t chunk_bool; @@ -148,17 +152,16 @@ void adj_to_csr(const raft::handle_t& handle, // independently). If the maximum number of active blocks (num_sms * // occupancy) exceeds the number of rows, assign multiple blocks to a single // row. - int threads_per_block = 1024; int dev_id, sm_count, blocks_per_sm; cudaGetDevice(&dev_id); cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, adj_to_csr_kernel, threads_per_block, 0); + &blocks_per_sm, adj_to_csr_kernel, adj_to_csr_tpb, 0); index_t max_active_blocks = sm_count * blocks_per_sm; index_t blocks_per_row = raft::ceildiv(max_active_blocks, num_rows); index_t grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row); - dim3 block(threads_per_block, 1); + dim3 block(adj_to_csr_tpb, 1); dim3 grid(blocks_per_row, grid_rows); adj_to_csr_kernel