Replace csr_adj_graph functions with faster equivalent (#746)
The `csr_adj_graph` functions are a performance bottleneck in the DBSCAN implementation in cuML. They are not used anywhere else.

This PR replaces the `csr_adj_graph` functions with the faster `adj_to_csr` function. It has the same functionality, *but* (see the caller-side sketch below the list):

1. It requires the input adjacency matrix to be in row-major order (rather than column-major).
2. The output column indices are not guaranteed to be in ascending order within a row, i.e. the resulting CSR is unsorted.
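
For callers, the switch looks roughly like the sketch below. This is a hedged illustration rather than part of the diff: the wrapper function, buffer names, and the assumption that `row_ind` and a sufficiently large `col_ind` already exist are hypothetical; only the `adj_to_csr` signature comes from this PR.

```cpp
// Hypothetical caller-side sketch (not part of this commit). Assumes a valid
// raft::handle_t, a row-major boolean adjacency matrix on the device, and a
// precomputed row_ind (the exclusive scan of per-row non-zero counts).
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

void convert_adjacency_example(const raft::handle_t& handle,
                               const bool* adj,     // n_rows x n_cols, row-major
                               const int* row_ind,  // length n_rows
                               int n_rows,
                               int n_cols,
                               int* col_ind)        // sized for the total nnz
{
  auto stream = handle.get_stream();
  // Scratch buffer of per-row atomic counters required by adj_to_csr.
  rmm::device_uvector<int> tmp(n_rows, stream);

  // Previously, csr_adj_graph took a column-major adj and produced sorted
  // column indices:
  //   raft::sparse::convert::csr_adj_graph<int>(row_ind, n_rows, nnz, adj,
  //                                             col_ind, stream, fused_op);
  // Now: adj must be row-major and col_ind is unsorted within each row.
  raft::sparse::convert::adj_to_csr<int>(
    handle, adj, row_ind, n_rows, n_cols, tmp.data(), col_ind);
}
```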

Authors:
  - Allard Hendriksen (https://github.com/ahendriksen)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #746
Allard Hendriksen authored Jul 22, 2022
1 parent fd2595c commit 362f91c
Showing 10 changed files with 456 additions and 402 deletions.
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/make_blobs.cu
bench/random/permute.cu
bench/random/rng.cu
bench/sparse/convert_csr.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
139 changes: 139 additions & 0 deletions cpp/bench/sparse/convert_csr.cu
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
  index_t num_cols;
  index_t num_rows;
  index_t divisor;
};

template <typename index_t>
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
  index_t r  = blockDim.y * blockIdx.y + threadIdx.y;
  index_t c0 = blockDim.x * blockIdx.x + threadIdx.x;

  for (; r < num_rows; r += gridDim.y * blockDim.y) {
    // Restart the column grid-stride loop for every row handled by this thread.
    for (index_t c = c0; c < num_cols; c += gridDim.x * blockDim.x) {
      adj[r * num_cols + c] = c % divisor == 0;
    }
  }
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
  // adj matrix: element a_ij is set to one if j is divisible by divisor.
  dim3 block(32, 32);
  const index_t max_y_grid_dim = 65535;
  dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
  init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
  RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
  bench_base(const bench_param<index_t>& p)
    : params(p),
      handle(stream),
      adj(p.num_rows * p.num_cols, stream),
      row_ind(p.num_rows, stream),
      row_ind_host(p.num_rows),
      row_counters(p.num_rows, stream),
      // col_ind is over-dimensioned because nnz is unknown at this point
      col_ind(p.num_rows * p.num_cols, stream)
  {
    init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

    std::vector<index_t> row_ind_host(p.num_rows);
    for (size_t i = 0; i < row_ind_host.size(); ++i) {
      size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor);
      row_ind_host[i]    = nnz_per_row * i;
    }
    raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream);
  }

  void run_benchmark(::benchmark::State& state) override
  {
    loop_on_state(state, [this]() {
      raft::sparse::convert::adj_to_csr<index_t>(handle,
                                                 adj.data(),
                                                 row_ind.data(),
                                                 params.num_rows,
                                                 params.num_cols,
                                                 row_counters.data(),
                                                 col_ind.data());
    });

    // Estimate bandwidth:
    index_t num_entries = params.num_rows * params.num_cols;
    index_t bytes_read  = num_entries * sizeof(bool);
    index_t bytes_write = num_entries / params.divisor * sizeof(index_t);

    state.counters["BW"]       = benchmark::Counter(bytes_read + bytes_write,
                                              benchmark::Counter::kIsIterationInvariantRate,
                                              benchmark::Counter::OneK::kIs1024);
    state.counters["BW read"]  = benchmark::Counter(
      bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
    state.counters["BW write"] = benchmark::Counter(bytes_write,
                                                    benchmark::Counter::kIsIterationInvariantRate,
                                                    benchmark::Counter::OneK::kIs1024);

    state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor));
    state.counters["Columns"]     = benchmark::Counter(params.num_cols);
    state.counters["Rows"]        = benchmark::Counter(params.num_rows);
  }

 protected:
  raft::handle_t handle;
  bench_param<index_t> params;
  rmm::device_uvector<bool> adj;
  rmm::device_uvector<index_t> row_ind;
  std::vector<index_t> row_ind_host;
  rmm::device_uvector<index_t> row_counters;
  rmm::device_uvector<index_t> col_ind;
}; // struct bench_base

const int64_t num_cols = 1 << 30;

const std::vector<bench_param<int64_t>> bench_params = {
  {num_cols, 1, 8},
  {num_cols >> 3, 1 << 3, 8},
  {num_cols >> 6, 1 << 6, 8},

  {num_cols, 1, 64},
  {num_cols >> 3, 1 << 3, 64},
  {num_cols >> 6, 1 << 6, 64},

  {num_cols, 1, 2048},
  {num_cols >> 3, 1 << 3, 2048},
  {num_cols >> 6, 1 << 6, 2048},
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);

} // namespace raft::bench::sparse
5 changes: 2 additions & 3 deletions cpp/doxygen/Doxyfile.in
@@ -844,9 +844,8 @@ EXCLUDE_PATTERNS = */detail/* \
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = detail \
csr_adj_graph \
csr_adj_graph_batched
EXCLUDE_SYMBOLS = detail


# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
32 changes: 4 additions & 28 deletions cpp/include/raft/sparse/convert/coo.hpp
@@ -18,34 +18,10 @@
* Please use the cuh version instead.
*/

#ifndef __COO_H
#define __COO_H

#pragma once

#include <raft/sparse/convert/detail/coo.cuh>

namespace raft {
namespace sparse {
namespace convert {

/**
* @brief Convert a CSR row_ind array to a COO rows array
* @param row_ind: Input CSR row_ind array
* @param m: size of row_ind array
* @param coo_rows: Output COO row array
* @param nnz: size of output COO row array
* @param stream: cuda stream to use
*/
template <typename value_idx = int>
void csr_to_coo(
const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
{
detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
#pragma message(__FILE__ \
" is deprecated and will be removed in a future release." \
" Please use the cuh version instead.")

#endif
#include "coo.cuh"
105 changes: 36 additions & 69 deletions cpp/include/raft/sparse/convert/csr.cuh
@@ -18,6 +18,7 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/csr.hpp>

@@ -39,74 +40,6 @@ void coo_to_csr(const raft::handle_t& handle,
detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @tparam Lambda function for fused operation in the adj_graph construction
* @param row_ind the input CSR row_ind array
* @param total_rows number of vertices in graph
* @param nnz number of non-zeros
* @param batchSize number of vertices in current batch
* @param adj an adjacency array (size batchSize x total_rows)
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op: the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op);
}

template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from a
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @param row_ind the input CSR row_ind array
* @param total_rows number of total vertices in graph
* @param nnz number of non-zeros
* @param adj an adjacency array
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph<Index_, 32, Lambda>(
row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op);
}

/**
* @brief Generate the row indices array for a sorted COO matrix
*
@@ -135,8 +68,42 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param tmp A pre-allocated array of size num_rows.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
  detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft

#endif
#endif
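
To make the `row_ind` contract above concrete, here is a minimal sketch (not part of the commit) that derives the offsets with a Thrust exclusive scan from per-row non-zero counts before calling `adj_to_csr`; the helper name and the `row_nnz` input are assumptions.

```cpp
// Hypothetical helper (not part of this commit). Given the number of
// non-zeros in each row of a row-major boolean adjacency matrix, build the
// CSR row offsets expected by adj_to_csr and fill the (unsorted) column
// indices.
#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

void adj_to_unsorted_csr(const raft::handle_t& handle,
                         const bool* adj,     // num_rows x num_cols, row-major
                         const int* row_nnz,  // non-zeros per row, length num_rows
                         int num_rows,
                         int num_cols,
                         int* row_ind,        // out: row offsets, length num_rows
                         int* col_ind)        // out: column indices, length total nnz
{
  auto stream = handle.get_stream();

  // row_ind[i] = row_nnz[0] + ... + row_nnz[i-1], i.e. the exclusive scan.
  thrust::exclusive_scan(
    thrust::cuda::par.on(stream), row_nnz, row_nnz + num_rows, row_ind);

  // Pre-allocated atomic counters, one per row (the `tmp` argument).
  rmm::device_uvector<int> tmp(num_rows, stream);

  raft::sparse::convert::adj_to_csr<int>(
    handle, adj, row_ind, num_rows, num_cols, tmp.data(), col_ind);
}
```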
(Diffs for the remaining 5 changed files are not shown here.)