Commit

Merge branch 'branch-22.08' into fea-knn-ivf-flat

achirkin committed Jul 26, 2022
2 parents 196b83f + af4f35c commit e6a815b
Showing 16 changed files with 481 additions and 423 deletions.
3 changes: 1 addition & 2 deletions ci/release/update-version.sh
@@ -34,8 +34,7 @@ function sed_runner() {
sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
sed_runner 's/'"pylibraft_version .*)"'/'"pylibraft_version ${NEXT_FULL_TAG})"'/g' python/pylibraft/CMakeLists.txt
sed_runner 's/'"pyraft_version .*)"'/'"pyraft_version ${NEXT_FULL_TAG})"'/g' python/raft/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' python/pylibraft/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake

# Docs update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
4 changes: 1 addition & 3 deletions cpp/CMakeLists.txt
@@ -17,9 +17,7 @@ set(RAPIDS_VERSION "22.06")
set(RAFT_VERSION "${RAPIDS_VERSION}.00")

cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake
${CMAKE_BINARY_DIR}/RAPIDS.cmake)
include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
include(../fetch_rapids.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-cuda)
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
bench/random/make_blobs.cu
bench/random/permute.cu
bench/random/rng.cu
bench/sparse/convert_csr.cu
bench/spatial/fused_l2_nn.cu
bench/spatial/knn.cu
bench/spatial/selection.cu
139 changes: 139 additions & 0 deletions cpp/bench/sparse/convert_csr.cu
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <stdio.h>

#include <common/benchmark.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

namespace raft::bench::sparse {

template <typename index_t>
struct bench_param {
index_t num_cols;
index_t num_rows;
index_t divisor;
};

template <typename index_t>
__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
{
index_t r = blockDim.y * blockIdx.y + threadIdx.y;

// Grid-stride loops; the column index is re-initialized for each row so that
// rows visited in later passes over the grid are also filled.
for (; r < num_rows; r += gridDim.y * blockDim.y) {
for (index_t c = blockDim.x * blockIdx.x + threadIdx.x; c < num_cols; c += gridDim.x * blockDim.x) {
adj[r * num_cols + c] = c % divisor == 0;
}
}
}

template <typename index_t>
void init_adj(bool* adj, index_t num_rows, index_t num_cols, index_t divisor, cudaStream_t stream)
{
// adj matrix: element a_ij is set to one if j is divisible by divisor.
dim3 block(32, 32);
const index_t max_y_grid_dim = 65535;
dim3 grid(num_cols / 32 + 1, (int)min(num_rows / 32 + 1, max_y_grid_dim));
init_adj_kernel<index_t><<<grid, block, 0, stream>>>(adj, num_rows, num_cols, divisor);
RAFT_CHECK_CUDA(stream);
}

template <typename index_t>
struct bench_base : public fixture {
bench_base(const bench_param<index_t>& p)
: params(p),
handle(stream),
adj(p.num_rows * p.num_cols, stream),
row_ind(p.num_rows, stream),
row_ind_host(p.num_rows),
row_counters(p.num_rows, stream),
// col_ind is over-dimensioned because nnz is unknown at this point
col_ind(p.num_rows * p.num_cols, stream)
{
init_adj(adj.data(), p.num_rows, p.num_cols, p.divisor, stream);

std::vector<index_t> row_ind_host(p.num_rows);
for (size_t i = 0; i < row_ind_host.size(); ++i) {
size_t nnz_per_row = raft::ceildiv(p.num_cols, p.divisor);
row_ind_host[i] = nnz_per_row * i;
}
raft::update_device(row_ind.data(), row_ind_host.data(), row_ind.size(), stream);
}

void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::sparse::convert::adj_to_csr<index_t>(handle,
adj.data(),
row_ind.data(),
params.num_rows,
params.num_cols,
row_counters.data(),
col_ind.data());
});

// Estimate bandwidth:
index_t num_entries = params.num_rows * params.num_cols;
index_t bytes_read = num_entries * sizeof(bool);
index_t bytes_write = num_entries / params.divisor * sizeof(index_t);

state.counters["BW"] = benchmark::Counter(bytes_read + bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);
state.counters["BW read"] = benchmark::Counter(
bytes_read, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
state.counters["BW write"] = benchmark::Counter(bytes_write,
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);

state.counters["Fraction nz"] = benchmark::Counter(100.0 / ((double)params.divisor));
state.counters["Columns"] = benchmark::Counter(params.num_cols);
state.counters["Rows"] = benchmark::Counter(params.num_rows);
}

protected:
raft::handle_t handle;
bench_param<index_t> params;
rmm::device_uvector<bool> adj;
rmm::device_uvector<index_t> row_ind;
std::vector<index_t> row_ind_host;
rmm::device_uvector<index_t> row_counters;
rmm::device_uvector<index_t> col_ind;
}; // struct bench_base

const int64_t num_cols = 1 << 30;

const std::vector<bench_param<int64_t>> bench_params = {
{num_cols, 1, 8},
{num_cols >> 3, 1 << 3, 8},
{num_cols >> 6, 1 << 6, 8},

{num_cols, 1, 64},
{num_cols >> 3, 1 << 3, 64},
{num_cols >> 6, 1 << 6, 64},

{num_cols, 1, 2048},
{num_cols >> 3, 1 << 3, 2048},
{num_cols >> 6, 1 << 6, 2048},
};

RAFT_BENCH_REGISTER(bench_base<int64_t>, "", bench_params);

} // namespace raft::bench::sparse
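
For a sense of scale, here is a small host-side sketch of the same bandwidth arithmetic for the first {num_cols, 1, 8} configuration with int64_t indices. It is illustrative only and simply mirrors the counter definitions in run_benchmark above:

#include <cstdint>
#include <cstdio>

int main()
{
  // Mirrors the counter arithmetic for {num_cols = 1 << 30, num_rows = 1, divisor = 8}.
  const std::int64_t num_rows    = 1;
  const std::int64_t num_cols    = std::int64_t{1} << 30;
  const std::int64_t divisor     = 8;
  const std::int64_t num_entries = num_rows * num_cols;                           // 2^30 adjacency entries
  const std::int64_t bytes_read  = num_entries * sizeof(bool);                    // 1 GiB of bool reads
  const std::int64_t bytes_write = num_entries / divisor * sizeof(std::int64_t);  // 1 GiB of index writes
  std::printf("per iteration: read %lld B, write %lld B, total %lld B\n",
              (long long)bytes_read,
              (long long)bytes_write,
              (long long)(bytes_read + bytes_write));
  return 0;
}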
5 changes: 2 additions & 3 deletions cpp/doxygen/Doxyfile.in
@@ -844,9 +844,8 @@ EXCLUDE_PATTERNS = */detail/* \
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*

EXCLUDE_SYMBOLS = detail \
csr_adj_graph \
csr_adj_graph_batched
EXCLUDE_SYMBOLS = detail


# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
14 changes: 4 additions & 10 deletions cpp/include/raft/comms/detail/util.hpp
@@ -65,9 +65,7 @@ namespace raft {
namespace comms {
namespace detail {

constexpr size_t

get_datatype_size(const datatype_t datatype)
constexpr size_t get_datatype_size(const datatype_t datatype)
{
switch (datatype) {
case datatype_t::CHAR: return sizeof(char);
@@ -82,9 +80,7 @@ get_datatype_size(const datatype_t datatype)
}
}

constexpr ncclDataType_t

get_nccl_datatype(const datatype_t datatype)
constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype)
{
switch (datatype) {
case datatype_t::CHAR: return ncclChar;
@@ -99,9 +95,7 @@ get_nccl_datatype(const datatype_t datatype)
}
}

constexpr ncclRedOp_t

get_nccl_op(const op_t op)
constexpr ncclRedOp_t get_nccl_op(const op_t op)
{
switch (op) {
case op_t::SUM: return ncclSum;
@@ -112,7 +106,7 @@ get_nccl_op(const op_t op)
}
}

status_t nccl_sync_stream(ncclComm_t comm, cudaStream_t stream)
inline status_t nccl_sync_stream(ncclComm_t comm, cudaStream_t stream)
{
cudaError_t cudaErr;
ncclResult_t ncclErr, ncclAsyncErr;
32 changes: 4 additions & 28 deletions cpp/include/raft/sparse/convert/coo.hpp
@@ -18,34 +18,10 @@
* Please use the cuh version instead.
*/

#ifndef __COO_H
#define __COO_H

#pragma once

#include <raft/sparse/convert/detail/coo.cuh>

namespace raft {
namespace sparse {
namespace convert {

/**
* @brief Convert a CSR row_ind array to a COO rows array
* @param row_ind: Input CSR row_ind array
* @param m: size of row_ind array
* @param coo_rows: Output COO row array
* @param nnz: size of output COO row array
* @param stream: cuda stream to use
*/
template <typename value_idx = int>
void csr_to_coo(
const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
{
detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
#pragma message(__FILE__ \
" is deprecated and will be removed in a future release." \
" Please use the cuh version instead.")

#endif
#include "coo.cuh"
105 changes: 36 additions & 69 deletions cpp/include/raft/sparse/convert/csr.cuh
@@ -18,6 +18,7 @@

#pragma once

#include <raft/sparse/convert/detail/adj_to_csr.cuh>
#include <raft/sparse/convert/detail/csr.cuh>
#include <raft/sparse/csr.hpp>

@@ -39,74 +40,6 @@ void coo_to_csr(const raft::handle_t& handle,
detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from
* a row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @tparam Lambda function for fused operation in the adj_graph construction
* @param row_ind the input CSR row_ind array
* @param total_rows number of vertices in graph
* @param nnz number of non-zeros
* @param batchSize number of vertices in current batch
* @param adj an adjacency array (size batchSize x total_rows)
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op: the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op);
}

template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph_batched(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
Index_ batchSize,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream)
{
detail::csr_adj_graph_batched<Index_, 32, Lambda>(
row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream);
}

/**
* @brief Constructs an adjacency graph CSR row_ind_ptr array from a
* row_ind array and adjacency array.
* @tparam T the numeric type of the index arrays
* @tparam TPB_X the number of threads to use per block for kernels
* @param row_ind the input CSR row_ind array
* @param total_rows number of total vertices in graph
* @param nnz number of non-zeros
* @param adj an adjacency array
* @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
* @param stream cuda stream to use
* @param fused_op the fused operation
*/
template <typename Index_, typename Lambda = auto(Index_, Index_, Index_)->void>
void csr_adj_graph(const Index_* row_ind,
Index_ total_rows,
Index_ nnz,
const bool* adj,
Index_* row_ind_ptr,
cudaStream_t stream,
Lambda fused_op)
{
detail::csr_adj_graph<Index_, 32, Lambda>(
row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op);
}

/**
* @brief Generate the row indices array for a sorted COO matrix
*
@@ -135,8 +68,42 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
}

/**
* @brief Converts a boolean adjacency matrix into unsorted CSR format.
*
* The conversion supports non-square matrices.
*
* @tparam index_t Indexing arithmetic type
*
* @param[in] handle RAFT handle
* @param[in] adj A num_rows x num_cols boolean matrix in contiguous row-major
* format.
* @param[in] row_ind An array of length num_rows that indicates at which index
* a row starts in out_col_ind. Equivalently, it is the
* exclusive scan of the number of non-zeros in each row of
* adj.
* @param[in] num_rows Number of rows of adj.
* @param[in] num_cols Number of columns of adj.
* @param tmp A pre-allocated array of size num_rows.
* @param[out] out_col_ind An array containing the column indices of the
* non-zero values in adj. Size should be at least the
* number of non-zeros in adj.
*/
template <typename index_t = int>
void adj_to_csr(const raft::handle_t& handle,
const bool* adj, // Row-major adjacency matrix
const index_t* row_ind, // Precomputed row indices
index_t num_rows, // # rows of adj
index_t num_cols, // # cols of adj
index_t* tmp, // Pre-allocated atomic counters. Minimum size: num_rows elements.
index_t* out_col_ind // Output column indices
)
{
detail::adj_to_csr(handle, adj, row_ind, num_rows, num_cols, tmp, out_col_ind);
}

}; // end NAMESPACE convert
}; // end NAMESPACE sparse
}; // end NAMESPACE raft

#endif
#endif
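
As a complement to the documentation above, here is a minimal host-side sketch of calling the adj_to_csr wrapper. The helper name is hypothetical and the include paths are assumed for the 22.08-era layout; buffer sizes follow the documented requirements (tmp holds num_rows counters, out_col_ind holds one entry per non-zero):

#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

// Hypothetical helper: converts a device-resident boolean adjacency matrix into
// unsorted CSR column indices, given precomputed row offsets and the total nnz.
void adjacency_to_csr_example(const raft::handle_t& handle,
                              const bool* d_adj,     // row-major num_rows x num_cols matrix
                              const int* d_row_ind,  // exclusive scan of per-row non-zero counts
                              int num_rows,
                              int num_cols,
                              int nnz)
{
  auto stream = handle.get_stream();
  rmm::device_uvector<int> tmp(num_rows, stream);  // per-row counters written by the conversion
  rmm::device_uvector<int> col_ind(nnz, stream);   // output column indices (unsorted within a row)
  raft::sparse::convert::adj_to_csr<int>(
    handle, d_adj, d_row_ind, num_rows, num_cols, tmp.data(), col_ind.data());
}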