Skip to content

Commit

Permalink
Remove duplicate adj_to_csr implementation (#4829)
Browse files Browse the repository at this point in the history
This functionality has been moved to RAFT.

Authors:
  - Allard Hendriksen (https://github.com/ahendriksen)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #4829
  • Loading branch information
Allard Hendriksen authored Jul 25, 2022
1 parent 2fad4c0 commit 092c4de
Showing 1 changed file with 3 additions and 104 deletions.
107 changes: 3 additions & 104 deletions cpp/src/dbscan/adjgraph/algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,95 +24,15 @@
#include "pack.h"

#include <raft/cuda_utils.cuh>
#include <raft/device_atomics.cuh>
#include <raft/handle.hpp>
#include <raft/vectorized.cuh>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace ML {
namespace Dbscan {
namespace AdjGraph {
namespace Algo {

/**
* @brief Convert a boolean adjacency matrix into CSR format.
*
* The adj_to_csr kernel converts a boolean adjacency matrix into CSR format.
* High performance comes at the cost of non-deterministic output: the column
* indices are not guaranteed to be stored in order.
*
* The kernel has been optimized to handle matrices that are non-square, for
* instance subsets of a full adjacency matrix. In practice, these matrices can
* be very wide and not very tall. In principle, each row is assigned to one
* block. If there are more SMs than rows, multiple blocks operate on a single
* row. To enable cooperation between these blocks, each row is provided a
* counter where the current output index can be cooperatively (atomically)
* incremented. As a result, the order of the output indices is not guaranteed
* to be in order.
*
* @param[in] adj: a num_rows x num_cols boolean matrix in contiguous row-major
* format.
*
* @param[in] row_ind: an array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* `adj`.
*
* @param[in] num_rows: number of rows of adj.
* @param[in] num_cols: number of columns of adj.
*
* @param[in,out] row_counters: a temporary zero-initialized array of length num_rows.
*
* @param[out] out_col_ind: an array containing the column indices of the
* non-zero values in `adj`. Size should be at least
* the number of non-zeros in `adj`.
*/
template <typename index_t>
__global__ void adj_to_csr(const bool* adj, // row-major adjacency matrix
const index_t* row_ind, // precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* row_counters, // pre-allocated (zeroed) atomic counters
index_t* out_col_ind // output column indices
)
{
typedef raft::TxN_t<bool, 16> bool16;

for (index_t i = blockIdx.y; i < num_rows; i += gridDim.y) {
// Load row information
index_t row_base = row_ind[i];
index_t* row_count = row_counters + i;
const bool* row = adj + i * num_cols;

// Peeling: process the first j0 elements that are not aligned to a 16-byte
// boundary.
index_t j0 = (16 - (((uintptr_t)(const void*)row) % 16)) % 16;
j0 = min(j0, num_cols);
if (threadIdx.x < j0 && blockIdx.x == 0) {
if (row[threadIdx.x]) { out_col_ind[row_base + atomicIncWarp(row_count)] = threadIdx.x; }
}

// Process the rest of the row in 16 byte chunks starting at j0.
// This is a grid-stride loop.
index_t j = j0 + 16 * (blockIdx.x * blockDim.x + threadIdx.x);
for (; j + 15 < num_cols; j += 16 * (blockDim.x * gridDim.x)) {
bool16 chunk;
chunk.load(row, j);

for (int k = 0; k < 16; ++k) {
if (chunk.val.data[k]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j + k; }
}
}

// Remainder: process the last j1 bools in the row individually.
index_t j1 = (num_cols - j0) % 16;
if (threadIdx.x < j1 && blockIdx.x == 0) {
int j = num_cols - j1 + threadIdx.x;
if (row[j]) { out_col_ind[row_base + atomicIncWarp(row_count)] = j; }
}
}
}

/**
* @brief Converts a boolean adjacency matrix into CSR format.
*
Expand Down Expand Up @@ -144,29 +64,8 @@ void launcher(const raft::handle_t& handle,
device_ptr<Index_> dev_ex_scan = device_pointer_cast(data.ex_scan);
thrust::exclusive_scan(handle.get_thrust_policy(), dev_vd, dev_vd + batch_size, dev_ex_scan);

// Zero-fill a temporary vector that can be used by the adj_to_csr kernel to
// keep track of the number of entries added to a row.
RAFT_CUDA_TRY(cudaMemsetAsync(row_counters, 0, batch_size * sizeof(Index_), stream));

// Split the grid in the row direction (since each row can be processed
// independently). If the maximum number of active blocks (num_sms *
// occupancy) exceeds the number of rows, assign multiple blocks to a single
// row.
int threads_per_block = 1024;
int dev_id, sm_count, blocks_per_sm;
cudaGetDevice(&dev_id);
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, adj_to_csr<Index_>, threads_per_block, 0);

Index_ max_active_blocks = sm_count * blocks_per_sm;
Index_ blocks_per_row = raft::ceildiv(max_active_blocks, num_rows);
Index_ grid_rows = raft::ceildiv(max_active_blocks, blocks_per_row);
dim3 block(threads_per_block, 1);
dim3 grid(blocks_per_row, grid_rows);

adj_to_csr<Index_><<<grid, block, 0, stream>>>(
adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
raft::sparse::convert::adj_to_csr(
handle, adj, data.ex_scan, num_rows, num_cols, row_counters, data.adj_graph);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

Expand Down

0 comments on commit 092c4de

Please sign in to comment.