Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace csr_adj_graph functions with faster equivalent #746

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/make_blobs.cu
bench/random/permute.cu
bench/random/rng.cu
bench/sparse/convert_csr.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
Expand Down
139 changes: 139 additions & 0 deletions cpp/bench/sparse/convert_csr.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
index_t num_cols;
index_t num_rows;
index_t divisor;
};

template <typename index_t>
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
index_t r = blockDim.y * blockIdx.y + threadIdx.y;
ahendriksen marked this conversation as resolved.
Show resolved Hide resolved
index_t c = blockDim.x * blockIdx.x + threadIdx.x;

for (; r < num_rows; r += gridDim.y * blockDim.y) {
for (; c < num_cols; c += gridDim.x * blockDim.x) {
adj[r * num_cols + c] = c % divisor == 0;
}
}
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
// adj matrix: element a_ij is set to one if j is divisible by divisor.
dim3 block(32, 32);
const index_t max_y_grid_dim = 65535;
dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
bench_base(const bench_param<index_t>& p)
: params(p),
handle(stream),
adj(p.num_rows * p.num_cols, stream),
row_ind(p.num_rows, stream),
row_ind_host(p.num_rows),
row_counters(p.num_rows, stream),
// col_ind is over-dimensioned because nnz is unknown at this point
col_ind(p.num_rows * p.num_cols, stream)
{
init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

std::vector<index_t> row_ind_host(p.num_rows);
for (size_t i = 0; i < row_ind_host.size(); ++i) {
size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor);
row_ind_host[i] = nnz_per_row * i;
}
raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream);
}

void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::sparse::convert::adj_to_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.num_rows,
params.num_cols,
row_counters.data(),
col_ind.data());
});

// Estimate bandwidth:
index_t num_entries = params.num_rows * params.num_cols;
index_t bytes_read = num_entries * sizeof(bool);
index_t bytes_write = num_entries / params.divisor * sizeof(index_t);

state.counters["BW"] = benchmark::Counter(bytes_read + bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);
state.counters["BW read"] = benchmark::Counter(
bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
state.counters["BW write"] = benchmark::Counter(bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);

state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor));
state.counters["Columns"] = benchmark::Counter(params.num_cols);
state.counters["Rows"] = benchmark::Counter(params.num_rows);
}

protected:
raft::handle_t handle;
bench_param<index_t> params;
rmm::device_uvector<bool> adj;
rmm::device_uvector<index_t> row_ind;
std::vector<index_t> row_ind_host;
rmm::device_uvector<index_t> row_counters;
rmm::device_uvector<index_t> col_ind;
}; // struct bench_base

const int64_t num_cols = 1 << 30;

const std::vector<bench_param<int64_t>> bench_params = {
{num_cols, 1, 8},
{num_cols >> 3, 1 << 3, 8},
{num_cols >> 6, 1 << 6, 8},

{num_cols, 1, 64},
{num_cols >> 3, 1 << 3, 64},
{num_cols >> 6, 1 << 6, 64},

{num_cols, 1, 2048},
{num_cols >> 3, 1 << 3, 2048},
{num_cols >> 6, 1 << 6, 2048},
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);

} // namespace raft::bench::sparse
5 changes: 2 additions & 3 deletions cpp/doxygen/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -844,9 +844,8 @@ EXCLUDE_PATTERNS = */detail/* \
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = detail \
csr_adj_graph \
csr_adj_graph_batched
EXCLUDE_SYMBOLS = detail


# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
Expand Down
32 changes: 4 additions & 28 deletions cpp/include/raft/sparse/convert/coo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,10 @@
* Please use the cuh version instead.
*/

#ifndef __COO_H
#define __COO_H

#pragma once

#include <raft/sparse/convert/detail/coo.cuh>

namespace raft {
namespace sparse {
namespace convert {

/**
* @brief Convert a CSR row_ind array to a COO rows array
* @param row_ind: Input CSR row_ind array
* @param m: size of row_ind array
* @param coo_rows: Output COO row array
* @param nnz: size of output COO row array
* @param stream: cuda stream to use
*/
template <typename value_idx = int>
void csr_to_coo(
const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
{
detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
#pragma message(__FILE__ \
" is deprecated and will be removed in a future release." \
" Please use the cuh version instead.")

#endif
#include "coo.cuh"
105 changes: 36 additions & 69 deletions cpp/include/raft/sparse/convert/csr.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/csr.hpp>

Expand All @@ -39,74 +40,6 @@ void coo_to_csr(const raft::handle_t& handle,
detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @tparam Lambda function for fused operation in the adj_graph construction
* @param row_ind the input CSR row_ind array
* @param total_rows number of vertices in graph
* @param nnz number of non-zeros
* @param batchSize number of vertices in current batch
* @param adj an adjacency array (size batchSize x total_rows)
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op: the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op);
}

template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from a
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @param row_ind the input CSR row_ind array
* @param total_rows number of total vertices in graph
* @param nnz number of non-zeros
* @param adj an adjacency array
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph<Index_, 32, Lambda>(
row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op);
}

/**
* @brief Generate the row indices array for a sorted COO matrix
*
Expand Down Expand Up @@ -135,8 +68,42 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param tmp A pre-allocated array of size num_rows.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft

#endif
#endif
Loading