Skip to content

Commit

Permalink
Add launch bounds to adj_to_csr_kernel
Browse files Browse the repository at this point in the history
Also: reduce threads per block from 1024 to 512. This improves
performance slightly on upcoming compute architectures.
  • Loading branch information
ahendriksen committed Aug 9, 2022
1 parent 8ab260e commit c546ff5
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 12 deletions.
2 changes: 0 additions & 2 deletions cpp/bench/sparse/convert_csr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
Expand Down
23 changes: 13 additions & 10 deletions cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ namespace sparse {
namespace convert {
namespace detail {

// Threads per block in adj_to_csr_kernel.
const int adj_to_csr_tpb = 512;

/**
* @brief Convert dense adjacency matrix into unsorted CSR format.
*
Expand Down Expand Up @@ -58,13 +61,14 @@ namespace detail {
* the number of non-zeros in `adj`.
*/
template <typename index_t>
__global__ void adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* row_counters, // pre-allocated (zeroed) atomic counters
index_t* out_col_ind // output column indices
)
__global__ void __launch_bounds__(adj_to_csr_tpb)
adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* row_counters, // pre-allocated (zeroed) atomic counters
index_t* out_col_ind // output column indices
)
{
const int chunk_size = 16;
typedef raft::TxN_t<bool, chunk_size> chunk_bool;
Expand Down Expand Up @@ -148,17 +152,16 @@ void adj_to_csr(const raft::handle_t& handle,
// independently). If the maximum number of active blocks (num_sms *
// occupancy) exceeds the number of rows, assign multiple blocks to a single
// row.
int threads_per_block = 1024;
int dev_id, sm_count, blocks_per_sm;
cudaGetDevice(&dev_id);
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, adj_to_csr_kernel<index_t>, threads_per_block, 0);
&blocks_per_sm, adj_to_csr_kernel<index_t>, adj_to_csr_tpb, 0);

index_t max_active_blocks = sm_count * blocks_per_sm;
index_t blocks_per_row = raft::ceildiv(max_active_blocks, num_rows);
index_t grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row);
dim3 block(threads_per_block, 1);
dim3 block(adj_to_csr_tpb, 1);
dim3 grid(blocks_per_row, grid_rows);

adj_to_csr_kernel<index_t>
Expand Down

0 comments on commit c546ff5

Please sign in to comment.