Skip to content

Commit

Permalink
sparse: Implement review feedback
Browse files Browse the repository at this point in the history
- Rename dense_bool_to_unsorted_csr to adj_to_csr
- Add grid-stride loops for test case generation (both bench and test)
- Remove overload

In addition:
- Add test case for empty input
- Fix behavior in case of empty input (return early)
  • Loading branch information
ahendriksen committed Jul 20, 2022
1 parent 598f77b commit 07bdc85
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 131 deletions.
55 changes: 34 additions & 21 deletions cpp/bench/sparse/convert_csr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,36 +24,50 @@

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
size_t num_cols;
size_t num_rows;
size_t divisor;
index_t num_cols;
index_t num_rows;
index_t divisor;
};

template <typename index_t>
__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
index_t r = blockDim.y * blockIdx.y + threadIdx.y;
index_t c = blockDim.x * blockIdx.x + threadIdx.x;

if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
for (; r < num_rows; r += gridDim.y * blockDim.y) {
for (; c < num_cols; c += gridDim.x * blockDim.x) {
adj[r * num_cols + c] = c % divisor == 0;
}
}
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
// adj matrix: element a_ij is set to one if j is divisible by divisor.
dim3 block(32, 32);
const index_t max_y_grid_dim = 65535;
dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
bench_base(const bench_param& p)
bench_base(const bench_param<index_t>& p)
: params(p),
handle(stream),
adj(p.num_rows * p.num_cols, stream),
row_ind(p.num_rows, stream),
row_ind_host(p.num_rows),
row_counters(p.num_rows, stream),
col_ind(p.num_rows * p.num_cols,
stream) // This is over-dimensioned because nnz is unknown at this point
// col_ind is over-dimensioned because nnz is unknown at this point
col_ind(p.num_rows * p.num_cols, stream)
{
dim3 block(32, 32);
dim3 grid(p.num_cols / 32 + 1, p.num_rows / 32 + 1);
init_adj<index_t><<<grid, block, 0, stream>>>(adj.data(), p.num_rows, p.num_cols, p.divisor);
init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

std::vector<index_t> row_ind_host(p.num_rows);
for (size_t i = 0; i < row_ind_host.size(); ++i) {
Expand All @@ -66,13 +80,13 @@ struct bench_base : public fixture {
void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::sparse::convert::dense_bool_to_unsorted_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.num_rows,
params.num_cols,
row_counters.data(),
col_ind.data());
raft::sparse::convert::adj_to_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.num_rows,
params.num_cols,
row_counters.data(),
col_ind.data());
});

// Estimate bandwidth:
Expand All @@ -96,7 +110,7 @@ struct bench_base : public fixture {

protected:
raft::handle_t handle;
bench_param params;
bench_param<index_t> params;
rmm::device_uvector<bool> adj;
rmm::device_uvector<index_t> row_ind;
std::vector<index_t> row_ind_host;
Expand All @@ -106,7 +120,7 @@ struct bench_base : public fixture {

const int64_t num_cols = 1 << 30;

const std::vector<bench_param> bench_params = {
const std::vector<bench_param<int64_t>> bench_params = {
{num_cols, 1, 8},
{num_cols >> 3, 1 << 3, 8},
{num_cols >> 6, 1 << 6, 8},
Expand All @@ -121,6 +135,5 @@ const std::vector<bench_param> bench_params = {
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);
// RAFT_BENCH_REGISTER(bench_base<int>, "", bench_params);

} // namespace raft::bench::sparse
51 changes: 9 additions & 42 deletions cpp/include/raft/sparse/convert/csr.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/convert/detail/dense_to_csr.cuh>
#include <raft/sparse/csr.hpp>

namespace raft {
Expand Down Expand Up @@ -90,49 +90,16 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
* number of non-zeros in adj.
*/
template <typename index_t = int>
void dense_bool_to_unsorted_csr(
const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* out_col_ind // output column indices
)
{
detail::dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, out_col_ind);
detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ namespace detail {
* block. If there are more SMs than rows, multiple blocks operate on a single
* row. To enable cooperation between these blocks, each row is provided a
* counter where the current output index can be cooperatively (atomically)
* incremented. As a result, the order of the output indices is not guaranteed
* to be in order.
* incremented. As a result, the order of the output indices is not guaranteed.
*
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
Expand All @@ -59,13 +58,12 @@ namespace detail {
* the number of non-zeros in `adj`.
*/
template <typename index_t>
__global__ void dense_bool_to_unsorted_csr_kernel(
const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* row_counters, // pre-allocated (zeroed) atomic counters
index_t* out_col_ind // output column indices
__global__ void adj_to_csr_kernel(const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* row_counters, // pre-allocated (zeroed) atomic counters
index_t* out_col_ind // output column indices
)
{
const int chunk_size = 16;
Expand Down Expand Up @@ -127,18 +125,20 @@ __global__ void dense_bool_to_unsorted_csr_kernel(
* number of non-zeros in adj.
*/
template <typename index_t = int>
void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // pre-allocated atomic counters
index_t* out_col_ind // output column indices
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // pre-allocated atomic counters
index_t* out_col_ind // output column indices
)
{
auto stream = handle.get_stream();

RAFT_EXPECTS(tmp != nullptr, "dense_bool_to_unsorted_csr: tmp workspace may not be null.");
// Check inputs and return early if possible.
if (num_rows == 0 || num_cols == 0) { return; }
RAFT_EXPECTS(tmp != nullptr, "adj_to_csr: tmp workspace may not be null.");

// Zero-fill a temporary vector that is be used by the kernel to keep track of
// the number of entries added to a row.
Expand All @@ -153,53 +153,19 @@ void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
cudaGetDevice(&dev_id);
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, dense_bool_to_unsorted_csr_kernel<index_t>, threads_per_block, 0);
&blocks_per_sm, adj_to_csr_kernel<index_t>, threads_per_block, 0);

index_t max_active_blocks = sm_count * blocks_per_sm;
index_t blocks_per_row = raft::ceildiv(max_active_blocks, num_rows);
index_t grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row);
dim3 block(threads_per_block, 1);
dim3 grid(blocks_per_row, grid_rows);

dense_bool_to_unsorted_csr_kernel<index_t>
adj_to_csr_kernel<index_t>
<<<grid, block, 0, stream>>>(adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void dense_bool_to_unsorted_csr(const raft::handle_t& handle,
const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* out_col_ind // output column indices
)
{
auto stream = handle.get_stream();
rmm::device_uvector<index_t> tmp(num_rows, stream);
dense_bool_to_unsorted_csr(handle, adj, row_ind, num_rows, num_cols, tmp.data(), out_col_ind);
}

}; // end NAMESPACE detail
}; // end NAMESPACE convert
}; // end NAMESPACE sparse
Expand Down
42 changes: 27 additions & 15 deletions cpp/test/sparse/convert_csr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,27 @@ INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesI
/******************************** adj graph ********************************/

template <typename index_t>
__global__ void init_adj(bool* adj, index_t num_rows, index_t num_cols, int divisor)
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
index_t r = blockDim.y * blockIdx.y + threadIdx.y;
index_t c = blockDim.x * blockIdx.x + threadIdx.x;

if (r < num_rows && c < num_cols) { adj[r * num_cols + c] = c % divisor == 0; }
for (; r < num_rows; r += gridDim.y * blockDim.y) {
for (; c < num_cols; c += gridDim.x * blockDim.x) {
adj[r * num_cols + c] = c % divisor == 0;
}
}
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
// adj matrix: element a_ij is set to one if j is divisible by divisor.
dim3 block(32, 32);
const index_t max_y_grid_dim = 65535;
dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
Expand Down Expand Up @@ -123,11 +138,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<index_
{
// Initialize adj matrix: element a_ij equals one if j is divisible by
// params.divisor.
dim3 block(32, 32);
dim3 grid(params.n_cols / 32 + 1, params.n_rows / 32 + 1);
init_adj<index_t>
<<<grid, block, 0, stream>>>(adj.data(), params.n_rows, params.n_cols, params.divisor);

init_adj(adj.data(), params.n_rows, params.n_cols, params.divisor, stream);
// Initialize row_ind
for (size_t i = 0; i < row_ind_host.size(); ++i) {
size_t nnz_per_row = raft::ceildiv(params.n_cols, params.divisor);
Expand All @@ -141,13 +152,13 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<index_

void Run()
{
convert::dense_bool_to_unsorted_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.n_rows,
params.n_cols,
row_counters.data(),
col_ind.data());
convert::adj_to_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.n_rows,
params.n_cols,
row_counters.data(),
col_ind.data());

std::vector<index_t> col_ind_host(col_ind.size());
raft::update_host(col_ind_host.data(), col_ind.data(), col_ind.size(), stream);
Expand All @@ -158,7 +169,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<index_
// 1. Check that each row contains enough values
index_t nnz_per_row = raft::ceildiv(params.n_cols, params.divisor);
for (index_t i = 0; i < params.n_rows; ++i) {
ASSERT_EQ(row_counters_host[i], nnz_per_row);
ASSERT_EQ(row_counters_host[i], nnz_per_row) << "where i = " << i;
}
// 2. Check that all column indices are divisble by divisor
for (index_t i = 0; i < params.n_rows; ++i) {
Expand Down Expand Up @@ -189,6 +200,7 @@ TEST_P(CSRAdjGraphTestL, Result) { Run(); }

const std::vector<CSRAdjGraphInputs<int>> csradjgraph_inputs_i = {{10, 10, 2}};
const std::vector<CSRAdjGraphInputs<int64_t>> csradjgraph_inputs_l = {
{0, 0, 2},
{10, 10, 2},
{64 * 1024 + 10, 2, 3}, // 64K + 10 is slightly over maximum of blockDim.y
{16, 16, 3}, // No peeling-remainder
Expand Down

0 comments on commit 07bdc85

Please sign in to comment.