From 07bdc85029d59bfba1998eda9df65ec2e0b835fe Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Wed, 20 Jul 2022 11:42:01 +0200
Subject: [PATCH] sparse: Implement review feedback

- Rename dense_bool_to_unsorted_csr to adj_to_csr
- Add grid-stride loops for test case generation (both bench and test)
- Remove overload

In addition:
- Add test case for empty input
- Fix behavior in case of empty input (return early)
---
 cpp/bench/sparse/convert_csr.cu               | 55 ++++++++------
 cpp/include/raft/sparse/convert/csr.cuh       | 51 +++----------
 .../{dense_to_csr.cuh => adj_to_csr.cuh}      | 72 +++++--------------
 cpp/test/sparse/convert_csr.cu                | 42 +++++++----
 4 files changed, 89 insertions(+), 131 deletions(-)
 rename cpp/include/raft/sparse/convert/detail/{dense_to_csr.cuh => adj_to_csr.cuh} (69%)

diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu
index dfcdc1ff90..0e701518ab 100644
--- a/cpp/bench/sparse/convert_csr.cu
+++ b/cpp/bench/sparse/convert_csr.cu
@@ -24,36 +24,50 @@

 namespace raft::bench::sparse {

+template <typename index_t>
 struct bench_param {
-  size_t num_cols;
-  size_t num_rows;
-  size_t divisor;
+  index_t num_cols;
+  index_t num_rows;
+  index_t divisor;
 };

 template <typename index_t>
-__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
+__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;

-  if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
+  for (; r < num_rows; r += gridDim.y * blockDim.y) {
+    for (index_t cc = c; cc < num_cols; cc += gridDim.x * blockDim.x) {  // restart column sweep per row
+      adj[r * num_cols + cc] = cc % divisor == 0;
+    }
+  }
+}
+
+template <typename index_t>
+void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
+{
+  // adj matrix: element a_ij is set to one if j is divisible by divisor.
+  dim3 block(32, 32);
+  const index_t max_y_grid_dim = 65535;
+  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
+  init_adj_kernel<<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
+  RAFT_CHECK_CUDA(stream);
 }

 template <typename index_t>
 struct bench_base : public fixture {
-  bench_base(const bench_param& p)
+  bench_base(const bench_param<index_t>& p)
     : params(p),
       handle(stream),
       adj(p.num_rows * p.num_cols, stream),
       row_ind(p.num_rows, stream),
       row_ind_host(p.num_rows),
       row_counters(p.num_rows, stream),
-      col_ind(p.num_rows * p.num_cols,
-              stream)  // This is over-dimensioned because nnz is unknown at this point
+      // col_ind is over-dimensioned because nnz is unknown at this point
+      col_ind(p.num_rows * p.num_cols, stream)
   {
-    dim3 block(32, 32);
-    dim3 grid(p.num_cols / 32 + 1, p.num_rows / 32 + 1);
-    init_adj<<<grid, block, 0, stream>>>(adj.data(), p.num_rows, p.num_cols, p.divisor);
+    init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

     std::vector<index_t> row_ind_host(p.num_rows);
     for (size_t i = 0; i < row_ind_host.size(); ++i) {
@@ -66,13 +80,13 @@ struct bench_base : public fixture {
   void run_benchmark(::benchmark::State& state) override
   {
     loop_on_state(state, [this]() {
-      raft::sparse::convert::dense_bool_to_unsorted_csr(handle,
-                                                        adj.data(),
-                                                        row_ind.data(),
-                                                        params.num_rows,
-                                                        params.num_cols,
-                                                        row_counters.data(),
-                                                        col_ind.data());
+      raft::sparse::convert::adj_to_csr(handle,
+                                        adj.data(),
+                                        row_ind.data(),
+                                        params.num_rows,
+                                        params.num_cols,
+                                        row_counters.data(),
+                                        col_ind.data());
     });

     // Estimate bandwidth:
@@ -96,7 +110,7 @@ struct bench_base : public fixture {

  protected:
   raft::handle_t handle;
-  bench_param params;
+  bench_param<index_t> params;
   rmm::device_uvector<bool> adj;
   rmm::device_uvector<index_t> row_ind;
   std::vector<index_t> row_ind_host;
@@ -106,7 +120,7 @@

 const int64_t num_cols = 1 << 30;

-const std::vector<bench_param> bench_params = {
+const std::vector<bench_param<int64_t>> bench_params = {
   {num_cols, 1, 8},
   {num_cols >> 3, 1 << 3, 8},
   {num_cols >> 6, 1 << 6, 8},
@@ -121,6 +135,5 @@ const std::vector<bench_param> bench_params = {
 };

 RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);
-// RAFT_BENCH_REGISTER(bench_base, "", bench_params);

 }  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 048e6469de..abdacdc426 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -18,8 +18,8 @@

 #pragma once

+#include <raft/sparse/convert/detail/adj_to_csr.cuh>
 #include <raft/sparse/convert/detail/csr.cuh>
-#include <raft/sparse/convert/detail/dense_to_csr.cuh>
 #include <raft/sparse/csr.hpp>

 namespace raft {
@@ -90,49 +90,16 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
 *                         number of non-zeros in adj.
 */
 template <typename index_t>
-void dense_bool_to_unsorted_csr(
-  const raft::handle_t& handle,
-  const bool* adj,         // Row-major adjacency matrix
-  const index_t* row_ind,  // Precomputed row indices
-  index_t num_rows,        // # rows of adj
-  index_t num_cols,        // # cols of adj
-  index_t* tmp,  // Pre-allocated atomic counters. Minimum size: num_rows elements.
-  index_t* out_col_ind     // Output column indices
+void adj_to_csr(const raft::handle_t& handle,
+                const bool* adj,         // Row-major adjacency matrix
+                const index_t* row_ind,  // Precomputed row indices
+                index_t num_rows,        // # rows of adj
+                index_t num_cols,        // # cols of adj
+                index_t* tmp,  // Pre-allocated atomic counters. Minimum size: num_rows elements.
+                index_t* out_col_ind     // Output column indices
 )
 {
-  detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
-}
-
-/**
- * @brief Converts a boolean adjacency matrix into unsorted CSR format.
- *
- * The conversion supports non-square matrices.
- *
- * @tparam index_t Indexing arithmetic type
- *
- * @param[in]  handle      RAFT handle
- * @param[in]  adj         A num_rows x num_cols boolean matrix in contiguous row-major
- *                         format.
- * @param[in]  row_ind     An array of length num_rows that indicates at which index
- *                         a row starts in out_col_ind. Equivalently, it is the
- *                         exclusive scan of the number of non-zeros in each row of
- *                         adj.
- * @param[in]  num_rows    Number of rows of adj.
- * @param[in]  num_cols    Number of columns of adj.
- * @param[out] out_col_ind An array containing the column indices of the
- *                         non-zero values in adj. Size should be at least the
- *                         number of non-zeros in adj.
- */
-template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* out_col_ind     // output column indices
-)
-{
-  detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, out_col_ind);
+  detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
 }

 };  // end NAMESPACE convert
diff --git a/cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
similarity index 69%
rename from cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh
rename to cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
index 6d28f486f1..16ed8c8c90 100644
--- a/cpp/include/raft/sparse/convert/detail/dense_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
@@ -42,8 +42,7 @@ namespace detail {
 * block. If there are more SMs than rows, multiple blocks operate on a single
 * row. To enable cooperation between these blocks, each row is provided a
 * counter where the current output index can be cooperatively (atomically)
- * incremented. As a result, the order of the output indices is not guaranteed
- * to be in order.
+ * incremented. As a result, the order of the output indices is not guaranteed.
 *
 * @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
 *                format.
@@ -59,13 +58,12 @@ namespace detail {
 *                the number of non-zeros in `adj`.
 */
 template <typename index_t>
-__global__ void dense_bool_to_unsorted_csr_kernel(
-  const bool* adj,         // row-major adjacency matrix
-  const index_t* row_ind,  // precomputed row indices
-  index_t num_rows,        // # rows of adj
-  index_t num_cols,        // # cols of adj
-  index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-  index_t* out_col_ind     // output column indices
+__global__ void adj_to_csr_kernel(const bool* adj,         // row-major adjacency matrix
+                                  const index_t* row_ind,  // precomputed row indices
+                                  index_t num_rows,        // # rows of adj
+                                  index_t num_cols,        // # cols of adj
+                                  index_t* row_counters,   // pre-allocated (zeroed) atomic counters
+                                  index_t* out_col_ind     // output column indices
 )
 {
   const int chunk_size = 16;
@@ -127,18 +125,20 @@ __global__ void dense_bool_to_unsorted_csr_kernel(
 *                         number of non-zeros in adj.
 */
 template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* tmp,            // pre-allocated atomic counters
-                                index_t* out_col_ind     // output column indices
+void adj_to_csr(const raft::handle_t& handle,
+                const bool* adj,         // row-major adjacency matrix
+                const index_t* row_ind,  // precomputed row indices
+                index_t num_rows,        // # rows of adj
+                index_t num_cols,        // # cols of adj
+                index_t* tmp,            // pre-allocated atomic counters
+                index_t* out_col_ind     // output column indices
 )
 {
   auto stream = handle.get_stream();

-  RAFT_EXPECTS(tmp != nullptr, "dense_bool_to_unsorted_csr: tmp workspace may not be null.");
+  // Check inputs and return early if possible.
+  if (num_rows == 0 || num_cols == 0) { return; }
+  RAFT_EXPECTS(tmp != nullptr, "adj_to_csr: tmp workspace may not be null.");

   // Zero-fill a temporary vector that is used by the kernel to keep track of
   // the number of entries added to a row.
@@ -153,7 +153,7 @@
   cudaGetDevice(&dev_id);
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, dense_bool_to_unsorted_csr_kernel<index_t>, threads_per_block, 0);
+    &blocks_per_sm, adj_to_csr_kernel<index_t>, threads_per_block, 0);

   index_t max_active_blocks = sm_count * blocks_per_sm;
   index_t blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
@@ -161,45 +161,11 @@
   dim3 block(threads_per_block, 1);
   dim3 grid(blocks_per_row, grid_rows);

-  dense_bool_to_unsorted_csr_kernel<index_t>
+  adj_to_csr_kernel<index_t>
     <<<grid, block, 0, stream>>>(adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }

-/**
- * @brief Converts a boolean adjacency matrix into unsorted CSR format.
- *
- * The conversion supports non-square matrices.
- *
- * @tparam index_t Indexing arithmetic type
- *
- * @param[in]  handle      RAFT handle
- * @param[in]  adj         A num_rows x num_cols boolean matrix in contiguous row-major
- *                         format.
- * @param[in]  row_ind     An array of length num_rows that indicates at which index
- *                         a row starts in out_col_ind. Equivalently, it is the
- *                         exclusive scan of the number of non-zeros in each row of
- *                         adj.
- * @param[in]  num_rows    Number of rows of adj.
- * @param[in]  num_cols    Number of columns of adj.
- * @param[out] out_col_ind An array containing the column indices of the
- *                         non-zero values in adj. Size should be at least the
- *                         number of non-zeros in adj.
- */
-template <typename index_t>
-void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
-                                const bool* adj,         // row-major adjacency matrix
-                                const index_t* row_ind,  // precomputed row indices
-                                index_t num_rows,        // # rows of adj
-                                index_t num_cols,        // # cols of adj
-                                index_t* out_col_ind     // output column indices
-)
-{
-  auto stream = handle.get_stream();
-  rmm::device_uvector<index_t> tmp(num_rows, stream);
-  dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp.data(), out_col_ind);
-}
-
 };  // end NAMESPACE detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 4731fefa83..a217a90e19 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -89,12 +89,27 @@ INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesI
 /******************************** adj graph ********************************/

 template <typename index_t>
-__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
+__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;

-  if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
+  for (; r < num_rows; r += gridDim.y * blockDim.y) {
+    for (index_t cc = c; cc < num_cols; cc += gridDim.x * blockDim.x) {  // restart column sweep per row
+      adj[r * num_cols + cc] = cc % divisor == 0;
+    }
+  }
+}
+
+template <typename index_t>
+void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
+{
+  // adj matrix: element a_ij is set to one if j is divisible by divisor.
+  dim3 block(32, 32);
+  const index_t max_y_grid_dim = 65535;
+  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
+  init_adj_kernel<<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
+  RAFT_CHECK_CUDA(stream);
 }

 template <typename index_t>
@@ -123,11 +138,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
-    init_adj<index_t>
-      <<<grid, block, 0, stream>>>(adj.data(), params.n_rows, params.n_cols, params.divisor);
-
+    init_adj(adj.data(), params.n_rows, params.n_cols, params.divisor, stream);
     // Initialize row_ind
     for (size_t i = 0; i < row_ind_host.size(); ++i) {
       size_t nnz_per_row = raft::ceildiv(params.n_cols, params.divisor);
@@ -141,13 +152,13 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
-    convert::dense_bool_to_unsorted_csr(handle,
-                                        adj.data(),
-                                        row_ind.data(),
-                                        params.n_rows,
-                                        params.n_cols,
-                                        row_counters.data(),
-                                        col_ind.data());
+    convert::adj_to_csr(handle,
+                        adj.data(),
+                        row_ind.data(),
+                        params.n_rows,
+                        params.n_cols,
+                        row_counters.data(),
+                        col_ind.data());

     std::vector<index_t> col_ind_host(col_ind.size());
     raft::update_host(col_ind_host.data(), col_ind.data(), col_ind.size(), stream);
@@ -158,7 +169,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam
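
Usage sketch (illustrative, not part of the patch): the snippet below shows how the renamed raft::sparse::convert::adj_to_csr entry point might be called after this change. Only the adj_to_csr signature and its documented preconditions (row_ind is the exclusive scan of per-row non-zero counts; the counter buffer holds at least num_rows elements) come from the patch above; the wrapper name convert_adj_example and the include paths are assumptions made for the example.

#include <raft/handle.hpp>              // assumed location of raft::handle_t
#include <raft/sparse/convert/csr.cuh>  // public header touched by this patch

// Hypothetical helper: converts a row-major boolean adjacency matrix (all
// pointers refer to device memory) into unsorted-CSR column indices.
template <typename index_t>
void convert_adj_example(const raft::handle_t& handle,
                         const bool* adj,         // num_rows x num_cols, row-major
                         const index_t* row_ind,  // exclusive scan of per-row nnz
                         index_t num_rows,
                         index_t num_cols,
                         index_t* row_counters,   // >= num_rows scratch counters
                         index_t* out_col_ind)    // >= nnz output column indices
{
  // Column indices within each row come back unsorted; adj_to_csr zero-fills
  // row_counters and uses them as per-row atomic cursors. With num_rows == 0
  // or num_cols == 0 the call returns early (behavior added by this patch).
  raft::sparse::convert::adj_to_csr(
    handle, adj, row_ind, num_rows, num_cols, row_counters, out_col_ind);
}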