Remove duplicate adj_to_csr implementation #4829

Merged
107 changes: 3 additions & 104 deletions cpp/src/dbscan/adjgraph/algo.cuh
@@ -24,95 +24,15 @@
#include "pack.h"

#include <raft/cuda_utils.cuh>
- #include <raft/device_atomics.cuh>
#include <raft/handle.hpp>
- #include <raft/vectorized.cuh>
+ #include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace ML {
namespace Dbscan {
namespace AdjGraph {
namespace Algo {

- /**
-  * @brief Convert a boolean adjacency matrix into CSR format.
-  *
-  * The adj_to_csr kernel converts a boolean adjacency matrix into CSR format.
-  * High performance comes at the cost of non-deterministic output: the column
-  * indices are not guaranteed to be stored in order.
-  *
-  * The kernel has been optimized to handle matrices that are non-square, for
-  * instance subsets of a full adjacency matrix. In practice, these matrices can
-  * be very wide and not very tall. In principle, each row is assigned to one
-  * block. If there are more SMs than rows, multiple blocks operate on a single
-  * row. To enable cooperation between these blocks, each row is provided a
-  * counter where the current output index can be cooperatively (atomically)
-  * incremented. As a result, the order of the output indices is not guaranteed
-  * to be in order.
-  *
-  * @param[in] adj: a num_rows x num_cols boolean matrix in contiguous row-major
-  *                 format.
-  *
-  * @param[in] row_ind: an array of length num_rows that indicates at which index
-  *                     a row starts in out_col_ind. Equivalently, it is the
-  *                     exclusive scan of the number of non-zeros in each row of
-  *                     `adj`.
-  *
-  * @param[in] num_rows: number of rows of adj.
-  * @param[in] num_cols: number of columns of adj.
-  *
-  * @param[in,out] row_counters: a temporary zero-initialized array of length num_rows.
-  *
-  * @param[out] out_col_ind: an array containing the column indices of the
-  *                          non-zero values in `adj`. Size should be at least
-  *                          the number of non-zeros in `adj`.
-  */
- template <typename index_t>
- __global__ void adj_to_csr(const bool* adj,         // row-major adjacency matrix
-                            const index_t* row_ind,  // precomputed row indices
-                            index_t num_rows,        // # rows of adj
-                            index_t num_cols,        // # cols of adj
-                            index_t* row_counters,   // pre-allocated (zeroed) atomic counters
-                            index_t* out_col_ind     // output column indices
- )
- {
-   typedef raft::TxN_t<bool, 16> bool16;
-
-   for (index_t i = blockIdx.y; i < num_rows; i += gridDim.y) {
-     // Load row information
-     index_t row_base   = row_ind[i];
-     index_t* row_count = row_counters + i;
-     const bool* row    = adj + i * num_cols;
-
-     // Peeling: process the first j0 elements that are not aligned to a 16-byte
-     // boundary.
-     index_t j0 = (16 - (((uintptr_t)(const void*)row) % 16)) % 16;
-     j0         = min(j0, num_cols);
-     if (threadIdx.x < j0 && blockIdx.x == 0) {
-       if (row[threadIdx.x]) { out_col_ind[row_base + atomicIncWarp(row_count)] = threadIdx.x; }
-     }
-
-     // Process the rest of the row in 16 byte chunks starting at j0.
-     // This is a grid-stride loop.
-     index_t j = j0 + 16 * (blockIdx.x * blockDim.x + threadIdx.x);
-     for (; j + 15 < num_cols; j += 16 * (blockDim.x * gridDim.x)) {
-       bool16 chunk;
-       chunk.load(row, j);
-
-       for (int k = 0; k < 16; ++k) {
-         if (chunk.val.data[k]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j + k; }
-       }
-     }
-
-     // Remainder: process the last j1 bools in the row individually.
-     index_t j1 = (num_cols - j0) % 16;
-     if (threadIdx.x < j1 && blockIdx.x == 0) {
-       int j = num_cols - j1 + threadIdx.x;
-       if (row[j]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j; }
-     }
-   }
- }

/**
* @brief Converts a boolean adjacency matrix into CSR format.
*
@@ -144,29 +64,8 @@ void launcher(const raft::handle_t& handle,
  device_ptr<Index_> dev_ex_scan = device_pointer_cast(data.ex_scan);
  thrust::exclusive_scan(handle.get_thrust_policy(), dev_vd, dev_vd + batch_size, dev_ex_scan);

-   // Zero-fill a temporary vector that can be used by the adj_to_csr kernel to
-   // keep track of the number of entries added to a row.
-   RAFT_CUDA_TRY(cudaMemsetAsync(row_counters, 0, batch_size * sizeof(Index_), stream));
-
-   // Split the grid in the row direction (since each row can be processed
-   // independently). If the maximum number of active blocks (num_sms *
-   // occupancy) exceeds the number of rows, assign multiple blocks to a single
-   // row.
-   int threads_per_block = 1024;
-   int dev_id, sm_count, blocks_per_sm;
-   cudaGetDevice(&dev_id);
-   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-     &blocks_per_sm, adj_to_csr<Index_>, threads_per_block, 0);
-
-   Index_ max_active_blocks = sm_count * blocks_per_sm;
-   Index_ blocks_per_row    = raft::ceildiv(max_active_blocks, num_rows);
-   Index_ grid_rows         = raft::ceildiv(max_active_blocks, blocks_per_row);
-   dim3 block(threads_per_block, 1);
-   dim3 grid(blocks_per_row, grid_rows);
-
-   adj_to_csr<Index_><<<grid, block, 0, stream>>>(
-     adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
+   raft::sparse::convert::adj_to_csr(
+     handle, adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

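For reviewers unfamiliar with this conversion: below is a serial CPU reference (illustrative only, not part of this PR, cuML, or RAFT) of what adj_to_csr computes. It also shows why the GPU output's within-row order is unspecified: the kernel reserves output slots through a per-row atomic counter rather than scanning columns in order, so only the set of indices per row is deterministic.

```cpp
// Serial reference for the adjacency -> CSR conversion; hypothetical helper.
#include <cstdio>
#include <vector>

std::vector<int> adj_to_csr_reference(const std::vector<bool>& adj,
                                      int num_rows,
                                      int num_cols,
                                      std::vector<int>& row_ind)  // out: size num_rows
{
  // row_ind is the exclusive scan of per-row non-zero counts.
  row_ind.assign(num_rows, 0);
  int nnz = 0;
  for (int i = 0; i < num_rows; ++i) {
    row_ind[i] = nnz;  // offset where row i starts in the output
    for (int j = 0; j < num_cols; ++j) {
      nnz += adj[i * num_cols + j] ? 1 : 0;
    }
  }
  std::vector<int> out_col_ind(nnz);
  std::vector<int> row_counters(num_rows, 0);  // mirrors the kernel's atomics
  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < num_cols; ++j) {
      if (adj[i * num_cols + j]) { out_col_ind[row_ind[i] + row_counters[i]++] = j; }
    }
  }
  return out_col_ind;
}

int main()
{
  // 2 x 4 adjacency matrix: row 0 -> {1, 3}, row 1 -> {0}.
  std::vector<bool> adj = {false, true, false, true, true, false, false, false};
  std::vector<int> row_ind;
  auto col_ind = adj_to_csr_reference(adj, 2, 4, row_ind);
  for (int c : col_ind) std::printf("%d ", c);  // prints: 1 3 0
  std::printf("\n");
}
```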
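The deleted kernel's peel/vectorize/remainder structure hinges on one line of pointer arithmetic. A small host-side illustration of that arithmetic, using hypothetical addresses:

```cpp
// j0 is how many leading bools must be handled one by one before aligned
// 16-byte vectorized loads (raft::TxN_t<bool, 16>) can start.
#include <cstdint>
#include <cstdio>

int main()
{
  // Only the row address modulo 16 matters.
  std::uintptr_t addrs[] = {0x1000, 0x1001, 0x100f, 0x1008};
  for (std::uintptr_t a : addrs) {
    int j0 = (16 - (a % 16)) % 16;  // yields 0, 15, 1, 8
    std::printf("addr %% 16 = %2d -> peel j0 = %2d bools\n", int(a % 16), j0);
  }
  // After the peel, the main loop reads aligned 16-bool chunks; a final
  // remainder of (num_cols - j0) % 16 bools is again handled individually.
}
```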
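The deleted launcher's grid split is easiest to follow with concrete numbers. A host-only sketch with hypothetical device figures (sm_count and blocks_per_sm vary by GPU and kernel; they are not taken from this PR):

```cpp
// Worked example of the occupancy-based grid sizing the launcher used.
#include <cstdio>

int ceildiv(int a, int b) { return (a + b - 1) / b; }  // same role as raft::ceildiv

int main()
{
  int sm_count      = 108;  // hypothetical; query with cudaDeviceGetAttribute
  int blocks_per_sm = 2;    // from cudaOccupancyMaxActiveBlocksPerMultiprocessor
  int num_rows      = 50;   // a batch that is much wider than it is tall

  int max_active_blocks = sm_count * blocks_per_sm;                    // 216
  int blocks_per_row    = ceildiv(max_active_blocks, num_rows);        // 5
  int grid_rows         = ceildiv(max_active_blocks, blocks_per_row);  // 44

  // grid = dim3(blocks_per_row, grid_rows): blockIdx.x splits one row across
  // cooperating blocks; blockIdx.y strides over rows (i += gridDim.y).
  std::printf("grid = (%d, %d), %d blocks total\n",
              blocks_per_row, grid_rows, blocks_per_row * grid_rows);  // (5, 44), 220
}
```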
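Finally, a minimal sketch of the consolidated call path after this PR, assuming only the header and argument order visible in this diff; the wrapper name convert_adj and its parameter names are illustrative, not cuML API:

```cpp
#include <raft/cudart_utils.h>  // for RAFT_CUDA_TRY; location may vary by RAFT version
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>

// row_ind is the exclusive scan of per-row non-zero counts, and tmp is a
// scratch array of num_rows counters, as in the deleted launcher.
template <typename index_t>
void convert_adj(const raft::handle_t& handle,
                 const bool* adj,         // device, num_rows x num_cols, row-major
                 const index_t* row_ind,  // device, exclusive scan of row nnz
                 index_t num_rows,
                 index_t num_cols,
                 index_t* tmp,            // device scratch, length num_rows
                 index_t* out_col_ind)    // device, length = total nnz
{
  raft::sparse::convert::adj_to_csr(
    handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}
```

Moving the zero-fill, occupancy query, and grid sizing behind this one RAFT entry point is what lets the PR drop 104 lines here while keeping behavior identical.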