From 6a7894fd62ae600064ff41620ea7c233ed09070b Mon Sep 17 00:00:00 2001 From: afender Date: Tue, 9 Feb 2021 17:57:55 -0600 Subject: [PATCH 01/11] get_handle_from_internal_pool --- cpp/include/raft/handle.hpp | 23 ++++++++++++++++++++++- cpp/test/handle.cpp | 11 +++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index af53968653..f38aec394c 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -67,9 +67,22 @@ class handle_t { host_allocator_(std::make_shared()) { create_resources(); } + handle_t(const handle_t& h) : dev_id_(h.get_device()), num_streams_(0) {} + handle_t(const handle_t&& h) : dev_id_(h.get_device()), num_streams_(0) {} + + handle_t& operator=(const handle_t& h) { + prop_ = h.get_device_properties(); + device_prop_initialized_ = true; + device_allocator_ = get_device_allocator(); + host_allocator_ = get_host_allocator(); + return *this; + } /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } + virtual ~handle_t() { + std::cout << "dtor" << std::endl; + destroy_resources(); + } int get_device() const { return dev_id_; } @@ -136,6 +149,14 @@ class handle_t { return int_streams_vec; } + handle_t get_handle_from_internal_pool( + int stream_id, int n_streams = kNumDefaultWorkerStreams) const { + handle_t handle(n_streams); + handle = *this; + handle.set_stream(this->get_internal_stream(stream_id)); + return handle; + } + void wait_on_user_stream() const { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (auto s : streams_) { diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 5f6f3ceece..4c8b327e76 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -49,4 +50,14 @@ TEST(Raft, GetInternalStreams) { ASSERT_EQ(4U, streams.size()); } +TEST(Raft, GetHandleFromPool) { + handle_t parent(4); + int sid = 2; + auto child = 
parent.get_handle_from_internal_pool(sid); + std::cout << "done" << std::endl; + + ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream()); + ASSERT_EQ(0, child.get_num_internal_streams()); + ASSERT_EQ(parent.get_device(), child.get_device()); +} } // namespace raft From d88bb146b71251a845b960db56958fca6c5855b7 Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 10 Feb 2021 16:16:58 -0600 Subject: [PATCH 02/11] added rmm stream pool as backend --- cpp/include/raft/handle.hpp | 43 ++++++++++++++----------------------- cpp/test/handle.cpp | 2 -- 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index f38aec394c..8b2aa58611 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -38,6 +38,7 @@ #include #include #include +#include #include "cudart_utils.h" namespace raft { @@ -62,13 +63,13 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - num_streams_(n_streams), + streams_(n_streams), device_allocator_(std::make_shared()), host_allocator_(std::make_shared()) { create_resources(); } - handle_t(const handle_t& h) : dev_id_(h.get_device()), num_streams_(0) {} - handle_t(const handle_t&& h) : dev_id_(h.get_device()), num_streams_(0) {} + handle_t(const handle_t& h) : dev_id_(h.get_device()) {} + handle_t(const handle_t&& h) : dev_id_(h.get_device()) {} handle_t& operator=(const handle_t& h) { prop_ = h.get_device_properties(); @@ -79,10 +80,7 @@ class handle_t { } /** Destroys all held-up resources */ - virtual ~handle_t() { - std::cout << "dtor" << std::endl; - destroy_resources(); - } + virtual ~handle_t() { destroy_resources(); } int get_device() const { return dev_id_; } @@ -139,12 +137,14 @@ class handle_t { return cusparse_handle_; } - cudaStream_t get_internal_stream(int sid) const { return streams_[sid]; } - int get_num_internal_streams() const { return num_streams_; } + cudaStream_t get_internal_stream(int sid) const { + return 
streams_.get_stream(sid).value(); + } + int get_num_internal_streams() const { return streams_.get_pool_size(); } std::vector get_internal_streams() const { std::vector int_streams_vec; - for (auto s : streams_) { - int_streams_vec.push_back(s); + for (int i = 0; i < get_num_internal_streams(); i++) { + int_streams_vec.push_back(get_internal_stream(i)); } return int_streams_vec; } @@ -159,14 +159,14 @@ class handle_t { void wait_on_user_stream() const { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); - for (auto s : streams_) { - CUDA_CHECK(cudaStreamWaitEvent(s, event_, 0)); + for (int i = 0; i < get_num_internal_streams(); i++) { + CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } void wait_on_internal_streams() const { - for (auto s : streams_) { - CUDA_CHECK(cudaEventRecord(event_, s)); + for (int i = 0; i < get_num_internal_streams(); i++) { + CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } @@ -213,8 +213,7 @@ class handle_t { std::unordered_map> subcomms_; const int dev_id_; - const int num_streams_; - std::vector streams_; + rmm::cuda_stream_pool streams_{0}; mutable cublasHandle_t cublas_handle_; mutable bool cublas_initialized_{false}; mutable cusolverDnHandle_t cusolver_dn_handle_; @@ -232,11 +231,6 @@ class handle_t { mutable std::mutex mutex_; void create_resources() { - for (int i = 0; i < num_streams_; ++i) { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - streams_.push_back(stream); - } CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } @@ -258,11 +252,6 @@ class handle_t { //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - while (!streams_.empty()) { - //CUDA_CHECK_NO_THROW(cudaStreamDestroy(streams_.back())); - CUDA_CHECK(cudaStreamDestroy(streams_.back())); - streams_.pop_back(); - } 
//CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 4c8b327e76..ee6d6d2a48 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -54,8 +54,6 @@ TEST(Raft, GetHandleFromPool) { handle_t parent(4); int sid = 2; auto child = parent.get_handle_from_internal_pool(sid); - std::cout << "done" << std::endl; - ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream()); ASSERT_EQ(0, child.get_num_internal_streams()); ASSERT_EQ(parent.get_device(), child.get_device()); From cf92c412371166cacd1ef262a0556d3df717581f Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 10 Feb 2021 16:27:31 -0600 Subject: [PATCH 03/11] added rmm stream pool as backend --- cpp/include/raft/handle.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 8b2aa58611..a42fdd67b2 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -86,6 +86,9 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } + rmm::cuda_stream_view get_stream_view() const { + return rmm::cuda_stream_view(user_stream_); + } void set_device_allocator(std::shared_ptr allocator) { device_allocator_ = allocator; @@ -137,9 +140,15 @@ class handle_t { return cusparse_handle_; } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); } + // new accessor return rmm::cuda_stream_view + rmm::cuda_stream_view get_internal_stream_view(int sid) const { + return streams_.get_stream(sid); + } + int get_num_internal_streams() const { return streams_.get_pool_size(); } std::vector get_internal_streams() const { std::vector int_streams_vec; From 4cebf2453eec8c48fadf5ff5cb12a4e21e914509 Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 10 Feb 2021 16:54:23 -0600 Subject: [PATCH 04/11] exposed 
rmm::cuda_stream_view for streams access --- cpp/test/handle.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index ee6d6d2a48..8fef4ead61 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -52,10 +52,25 @@ TEST(Raft, GetInternalStreams) { TEST(Raft, GetHandleFromPool) { handle_t parent(4); - int sid = 2; - auto child = parent.get_handle_from_internal_pool(sid); - ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream()); + + auto child = parent.get_handle_from_internal_pool(2); + ASSERT_EQ(parent.get_internal_stream(2), child.get_stream()); ASSERT_EQ(0, child.get_num_internal_streams()); + + child.set_stream(parent.get_internal_stream(3)); + ASSERT_EQ(parent.get_internal_stream(3), child.get_stream()); + ASSERT_NE(parent.get_internal_stream(2), child.get_stream()); + ASSERT_EQ(parent.get_device(), child.get_device()); } + +TEST(Raft, GetHandleStreamViews) { + handle_t parent(4); + + auto child = parent.get_handle_from_internal_pool(2); + ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), + child.get_stream_view().value()); + EXPECT_FALSE(child.get_stream_view().is_default()); +} } // namespace raft From 2c38896e63e5c726108385012b71c0b1b6beec80 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 11 Feb 2021 15:14:10 -0500 Subject: [PATCH 05/11] Moving cuml sparse prims to raft (#139) Moving cuml sparse prims and gtests to raft. The namespaces have already been adjusted in cuml so this is a simple move for the most part. There are a few places where includes needed to be updated and I Needed to remove cuml debug logs. Authors: - Corey J. 
Nolet (@cjnolet) Approvers: - Divye Gala (@divyegala) - Dante Gama Dessavre (@dantegd) URL: https://github.com/rapidsai/raft/pull/139 --- cpp/CMakeLists.txt | 19 +- cpp/include/raft/sparse/convert/coo.cuh | 74 ++ cpp/include/raft/sparse/convert/csr.cuh | 189 +++++ cpp/include/raft/sparse/convert/dense.cuh | 110 +++ cpp/include/raft/sparse/coo.cuh | 259 ++++++ cpp/include/raft/sparse/csr.cuh | 263 ++++++ .../raft/sparse/distance/bin_distance.cuh | 195 +++++ cpp/include/raft/sparse/distance/common.h | 59 ++ cpp/include/raft/sparse/distance/coo_spmv.cuh | 350 ++++++++ cpp/include/raft/sparse/distance/csr_spmv.cuh | 484 +++++++++++ cpp/include/raft/sparse/distance/distance.cuh | 104 +++ .../raft/sparse/distance/ip_distance.cuh | 329 ++++++++ .../raft/sparse/distance/l2_distance.cuh | 262 ++++++ .../raft/sparse/distance/lp_distance.cuh | 196 +++++ .../raft/sparse/distance/operators.cuh | 88 ++ cpp/include/raft/sparse/linalg/add.cuh | 226 ++++++ cpp/include/raft/sparse/linalg/degree.cuh | 184 +++++ cpp/include/raft/sparse/linalg/norm.cuh | 169 ++++ cpp/include/raft/sparse/linalg/spectral.cuh | 103 +++ cpp/include/raft/sparse/linalg/symmetrize.cuh | 309 +++++++ cpp/include/raft/sparse/linalg/transpose.h | 87 ++ cpp/include/raft/sparse/op/filter.cuh | 201 +++++ cpp/include/raft/sparse/op/row_op.cuh | 76 ++ cpp/include/raft/sparse/op/slice.h | 99 +++ cpp/include/raft/sparse/op/sort.h | 105 +++ cpp/include/raft/sparse/selection/knn.cuh | 483 +++++++++++ .../raft/sparse/selection/selection.cuh | 157 ++++ cpp/include/raft/sparse/utils.h | 114 +++ cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/test/sparse/add.cu | 174 ++++ cpp/test/sparse/convert_coo.cu | 98 +++ cpp/test/sparse/convert_csr.cu | 180 +++++ cpp/test/sparse/csr_row_slice.cu | 184 +++++ cpp/test/sparse/csr_to_dense.cu | 140 ++++ cpp/test/sparse/csr_transpose.cu | 174 ++++ cpp/test/sparse/degree.cu | 110 +++ cpp/test/sparse/dist_coo_spmv.cu | 628 ++++++++++++++ cpp/test/sparse/dist_csr_spmv.cu | 608 ++++++++++++++ 
cpp/test/sparse/distance.cu | 764 ++++++++++++++++++ cpp/test/sparse/filter.cu | 122 +++ cpp/test/sparse/knn.cu | 192 +++++ cpp/test/sparse/norm.cu | 127 +++ cpp/test/sparse/row_op.cu | 111 +++ cpp/test/sparse/selection.cu | 157 ++++ cpp/test/sparse/sort.cu | 103 +++ cpp/test/sparse/symmetrize.cu | 111 +++ 46 files changed, 9277 insertions(+), 2 deletions(-) create mode 100644 cpp/include/raft/sparse/convert/coo.cuh create mode 100644 cpp/include/raft/sparse/convert/csr.cuh create mode 100644 cpp/include/raft/sparse/convert/dense.cuh create mode 100644 cpp/include/raft/sparse/coo.cuh create mode 100644 cpp/include/raft/sparse/csr.cuh create mode 100644 cpp/include/raft/sparse/distance/bin_distance.cuh create mode 100644 cpp/include/raft/sparse/distance/common.h create mode 100644 cpp/include/raft/sparse/distance/coo_spmv.cuh create mode 100644 cpp/include/raft/sparse/distance/csr_spmv.cuh create mode 100644 cpp/include/raft/sparse/distance/distance.cuh create mode 100644 cpp/include/raft/sparse/distance/ip_distance.cuh create mode 100644 cpp/include/raft/sparse/distance/l2_distance.cuh create mode 100644 cpp/include/raft/sparse/distance/lp_distance.cuh create mode 100644 cpp/include/raft/sparse/distance/operators.cuh create mode 100644 cpp/include/raft/sparse/linalg/add.cuh create mode 100644 cpp/include/raft/sparse/linalg/degree.cuh create mode 100644 cpp/include/raft/sparse/linalg/norm.cuh create mode 100644 cpp/include/raft/sparse/linalg/spectral.cuh create mode 100644 cpp/include/raft/sparse/linalg/symmetrize.cuh create mode 100644 cpp/include/raft/sparse/linalg/transpose.h create mode 100644 cpp/include/raft/sparse/op/filter.cuh create mode 100644 cpp/include/raft/sparse/op/row_op.cuh create mode 100644 cpp/include/raft/sparse/op/slice.h create mode 100644 cpp/include/raft/sparse/op/sort.h create mode 100644 cpp/include/raft/sparse/selection/knn.cuh create mode 100644 cpp/include/raft/sparse/selection/selection.cuh create mode 100644 
cpp/include/raft/sparse/utils.h create mode 100644 cpp/test/sparse/add.cu create mode 100644 cpp/test/sparse/convert_coo.cu create mode 100644 cpp/test/sparse/convert_csr.cu create mode 100644 cpp/test/sparse/csr_row_slice.cu create mode 100644 cpp/test/sparse/csr_to_dense.cu create mode 100644 cpp/test/sparse/csr_transpose.cu create mode 100644 cpp/test/sparse/degree.cu create mode 100644 cpp/test/sparse/dist_coo_spmv.cu create mode 100644 cpp/test/sparse/dist_csr_spmv.cu create mode 100644 cpp/test/sparse/distance.cu create mode 100644 cpp/test/sparse/filter.cu create mode 100644 cpp/test/sparse/knn.cu create mode 100644 cpp/test/sparse/norm.cu create mode 100644 cpp/test/sparse/row_op.cu create mode 100644 cpp/test/sparse/selection.cu create mode 100644 cpp/test/sparse/sort.cu create mode 100644 cpp/test/sparse/symmetrize.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3baee48a5f..ae91d75b31 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -277,6 +277,23 @@ if(BUILD_RAFT_TESTS) test/random/rng.cu test/random/rng_int.cu test/random/sample_without_replacement.cu + test/sparse/add.cu + test/sparse/convert_coo.cu + test/sparse/convert_csr.cu + test/sparse/csr_row_slice.cu + test/sparse/csr_to_dense.cu + test/sparse/csr_transpose.cu + test/sparse/degree.cu + test/sparse/dist_coo_spmv.cu + test/sparse/dist_csr_spmv.cu + test/sparse/distance.cu + test/sparse/filter.cu + test/sparse/knn.cu + test/sparse/norm.cu + test/sparse/row_op.cu + test/sparse/selection.cu + test/sparse/sort.cu + test/sparse/symmetrize.cu test/spatial/knn.cu test/stats/mean.cu test/stats/mean_center.cu diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh new file mode 100644 index 0000000000..e367550060 --- /dev/null +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include + +namespace raft { +namespace sparse { +namespace convert { + +template +__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, + value_idx *coo_rows, value_idx nnz) { + // row-based matrix 1 thread per row + value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; + if (row < m) { + value_idx start_idx = row_ind[row]; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + } +} + +/** + * @brief Convert a CSR row_ind array to a COO rows array + * @param row_ind: Input CSR row_ind array + * @param m: size of row_ind array + * @param coo_rows: Output COO row array + * @param nnz: size of output COO row array + * @param stream: cuda stream to use + */ +template +void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, + value_idx nnz, cudaStream_t stream) { + // @TODO: Use cusparse for this. + dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + + csr_to_coo_kernel + <<>>(row_ind, m, coo_rows, nnz); + + CUDA_CHECK(cudaGetLastError()); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh new file mode 100644 index 0000000000..a034bdbda8 --- /dev/null +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace raft { +namespace sparse { +namespace convert { + +template +void coo_to_csr(const raft::handle_t &handle, const int *srcRows, + const int *srcCols, const value_t *srcVals, int nnz, int m, + int *dst_offsets, int *dstCols, value_t *dstVals) { + auto stream = handle.get_stream(); + auto cusparseHandle = handle.get_cusparse_handle(); + auto d_alloc = handle.get_device_allocator(); + raft::mr::device::buffer dstRows(d_alloc, stream, nnz); + CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); + auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( + cusparseHandle, m, m, nnz, srcRows, srcCols, stream); + raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); + raft::mr::device::buffer P(d_alloc, stream, nnz); + CUSPARSE_CHECK( + cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), + dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), + stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, + dst_offsets, stream); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +/** + * @brief Constructs 
an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array. + * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @tparam Lambda function for fused operation in the adj_graph construction + * @param row_ind the input CSR row_ind array + * @param total_rows number of vertices in graph + * @param nnz number of non-zeros + * @param batchSize number of vertices in current batch + * @param adj an adjacency array (size batchSize x total_rows) + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op: the fused operation + */ +template void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { + op::csr_row_op( + row_ind, batchSize, nnz, + [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( + Index_ row, Index_ start_idx, Index_ stop_idx) { + fused_op(row, start_idx, stop_idx); + Index_ k = 0; + for (Index_ i = 0; i < total_rows; i++) { + // @todo: uncoalesced mem accesses! + if (adj[batchSize * i + row]) { + row_ind_ptr[start_idx + k] = i; + k += 1; + } + } + }, + stream); +} + +template void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched( + row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from a + * a row_ind array and adjacency array. 
+ * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @param row_ind the input CSR row_ind array + * @param total_rows number of total vertices in graph + * @param nnz number of non-zeros + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op the fused operation + */ +template void> +void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { + csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, + adj, row_ind_ptr, stream, fused_op); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param rows: COO rows array + * @param nnz: size of COO rows array + * @param row_ind: output row indices array + * @param m: number of rows in dense matrix + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, + std::shared_ptr d_alloc, + cudaStream_t stream) { + raft::mr::device::buffer row_counts(d_alloc, stream, m); + + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); + + linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); + + // create csr compressed row index from row counts + thrust::device_ptr row_counts_d = + thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + c_ind_d); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param coo: Input COO matrix + * @param row_ind: output row indices array + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void sorted_coo_to_csr(COO *coo, int 
*row_ind, + std::shared_ptr d_alloc, + cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, + stream); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh new file mode 100644 index 0000000000..299f9d36d4 --- /dev/null +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace convert { + +template +__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, + const value_t *csrVal, + const int *csrRowPtr, + const int *csrColInd, + value_t *a) { + int row = blockIdx.x; + int tid = threadIdx.x; + + int colStart = csrRowPtr[row]; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; + + for (int i = tid; i < rowNnz; i += blockDim.x) { + int colIdx = colStart + i; + if (colIdx < colEnd) { + int col = csrColInd[colIdx]; + a[row * n_cols + col] = csrVal[colIdx]; + } + } +} + +/** + * Convert CSR arrays to a dense matrix in either row- + * or column-major format. 
A custom kernel is used when + * row-major output is desired since cusparse does not + * output row-major. + * @tparam value_idx : data type of the CSR index arrays + * @tparam value_t : data type of the CSR value array + * @param[in] handle : cusparse handle for conversion + * @param[in] nrows : number of rows in CSR + * @param[in] ncols : number of columns in CSR + * @param[in] csr_indptr : CSR row index pointer array + * @param[in] csr_indices : CSR column indices array + * @param[in] csr_data : CSR data array + * @param[in] lda : Leading dimension (used for col-major only) + * @param[out] out : Dense output array of size nrows * ncols + * @param[in] stream : Cuda stream for ordering events + * @param[in] row_major : Is row-major output desired? + */ +template +void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, + const value_idx *csr_indptr, const value_idx *csr_indices, + const value_t *csr_data, value_idx lda, value_t *out, + cudaStream_t stream, bool row_major = true) { + if (!row_major) { + /** + * If we need col-major, use cusparse. 
+ */ + cusparseMatDescr_t out_mat; + CUSPARSE_CHECK(cusparseCreateMatDescr(&out_mat)); + CUSPARSE_CHECK(cusparseSetMatIndexBase(out_mat, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + + CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, + lda, stream)); + + CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); + + } else { + int blockdim = block_dim(ncols); + CUDA_CHECK( + cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + csr_to_dense_warp_per_row_kernel<<>>( + ncols, csr_data, csr_indptr, csr_indices, out); + } +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh new file mode 100644 index 0000000000..73120fea8c --- /dev/null +++ b/cpp/include/raft/sparse/coo.cuh @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#define restrict __restrict__ + +#pragma once + +namespace raft { +namespace sparse { + +/** @brief A Container object for sparse coordinate. There are two motivations + * behind using a container for COO arrays. 
+ * + * The first motivation is that it simplifies code, rather than always having + * to pass three arrays as function arguments. + * + * The second is more subtle, but much more important. The size + * of the resulting COO from a sparse operation is often not known ahead of time, + * since it depends on the contents of the underlying graph. The COO object can + * allocate the underlying arrays lazily so that the object can be created by the + * user and passed as an output argument in a sparse primitive. The sparse primitive + * would have the responsibility for allocating and populating the output arrays, + * while the original caller still maintains ownership of the underlying memory. + * + * @tparam T: the type of the value array. + * @tparam Index_Type: the type of index array + * + */ +template +class COO { + protected: + raft::mr::device::buffer rows_arr; + raft::mr::device::buffer cols_arr; + raft::mr::device::buffer vals_arr; + + public: + Index_Type nnz; + Index_Type n_rows; + Index_Type n_cols; + + /** + * @param d_alloc: the device allocator to use for the underlying buffers + * @param stream: CUDA stream to use + */ + COO(std::shared_ptr d_alloc, cudaStream_t stream) + : rows_arr(d_alloc, stream, 0), + cols_arr(d_alloc, stream, 0), + vals_arr(d_alloc, stream, 0), + nnz(0), + n_rows(0), + n_cols(0) {} + + /** + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(raft::mr::device::buffer &rows, + raft::mr::device::buffer &cols, + raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, + Index_Type n_cols = 0) + : rows_arr(rows), + cols_arr(cols), + vals_arr(vals), + nnz(nnz), + n_rows(n_rows), + n_cols(n_cols) {} + + /** + * @param d_alloc: the device allocator use + * @param stream: CUDA stream to use + * @param nnz: size of the 
rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(std::shared_ptr d_alloc, cudaStream_t stream, + Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, + bool init = true) + : rows_arr(d_alloc, stream, nnz), + cols_arr(d_alloc, stream, nnz), + vals_arr(d_alloc, stream, nnz), + nnz(nnz), + n_rows(n_rows), + n_cols(n_cols) { + if (init) init_arrays(stream); + } + + void init_arrays(cudaStream_t stream) { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, + this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, + this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK( + cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + } + + ~COO() {} + + /** + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const { + if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; + return true; + } + + /** + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. 
+ */ + bool validate_mem() const { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || + this->vals_arr.size() == 0) { + return false; + } + + return true; + } + + /* + * @brief Returns the rows array + */ + Index_Type *rows() { return this->rows_arr.data(); } + + /** + * @brief Returns the cols array + */ + Index_Type *cols() { return this->cols_arr.data(); } + + /** + * @brief Returns the vals array + */ + T *vals() { return this->vals_arr.data(); } + + /** + * @brief Send human-readable state information to output stream + */ + friend std::ostream &operator<<(std::ostream &out, + const COO &c) { + if (c.validate_size() && c.validate_mem()) { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) + << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) + << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) + << std::endl; + out << "nnz=" << c.nnz << std::endl; + out << "n_rows=" << c.n_rows << std::endl; + out << "n_cols=" << c.n_cols << std::endl; + + CUDA_CHECK(cudaStreamDestroy(stream)); + } else { + out << "Cannot print COO object: Uninitialized or invalid." << std::endl; + } + + return out; + } + + /** + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) { + this->n_rows = n_rows; + this->n_cols = n_cols; + } + + /** + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) { + this->n_rows = n; + this->n_cols = n; + } + + /** + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? 
+ * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { + this->allocate(nnz, 0, init, stream); + } + + /** + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) { + this->allocate(nnz, size, size, init, stream); + } + + /** + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, + cudaStream_t stream) { + this->n_rows = n_rows; + this->n_cols = n_cols; + this->nnz = nnz; + + this->rows_arr.resize(this->nnz, stream); + this->cols_arr.resize(this->nnz, stream); + this->vals_arr.resize(this->nnz, stream); + + if (init) init_arrays(stream); + } +}; + +}; // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh new file mode 100644 index 0000000000..bc4a68d296 --- /dev/null +++ b/cpp/include/raft/sparse/csr.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { +namespace sparse { + +//@TODO: Pull this out into a separate file + +struct WeakCCState { + public: + bool *m; + WeakCCState(bool *m) : m(m) {} +}; + +template +__global__ void weak_cc_label_device(Index_ *__restrict__ labels, + const Index_ *__restrict__ row_ind, + const Index_ *__restrict__ row_ind_ptr, + Index_ nnz, bool *__restrict__ m, + Index_ start_vertex_id, Index_ batch_size, + Index_ N, Lambda filter_op) { + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; + Index_ global_id = tid + start_vertex_id; + if (tid < batch_size && global_id < N) { + Index_ start = __ldg(row_ind + tid); + + Index_ ci, cj; + bool ci_mod = false; + ci = labels[global_id]; + bool ci_allow_prop = filter_op(global_id); + + Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); + /// TODO: add one element to row_ind and avoid get_stop_idx + for (Index_ j = start; j < end; j++) { + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; + bool cj_allow_prop = filter_op(j_ind); + if (ci < cj && ci_allow_prop) { + if (sizeof(Index_) == 4) + atomicMin((int *)(labels + j_ind), ci); + else if (sizeof(Index_) == 8) + atomicMin((long long int *)(labels + j_ind), ci); + if (cj_allow_prop) *m = true; + } else if (ci > cj && cj_allow_prop) { + ci = cj; + ci_mod = true; + } + } + if (ci_mod) { + if (sizeof(Index_) == 4) + atomicMin((int *)(labels + global_id), ci); + else if (sizeof(Index_) == 8) + atomicMin((long long int *)(labels + global_id), ci); + if (ci_allow_prop) *m = true; + } + } +} + +template +__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, + Index_ MAX_LABEL, Lambda filter_op) { + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; + if (tid < N) { + if (filter_op(tid)) + 
labels[tid] = tid + 1; + else + labels[tid] = MAX_LABEL; + } +} // weak_cc_init_all_kernel + +/** + * @brief Partial calculation of the weakly connected components in the + * context of a batched algorithm: the labels are computed wrt the sub-graph + * represented by the given CSR matrix of dimensions batch_size * N. + * Note that this overwrites the labels array and it is the responsibility of + * the caller to combine the results from different batches + * (cf label/merge_labels.cuh) + * + * @tparam Index_ the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param start_vertex_id the starting vertex index for the current batch + * @param batch_size number of vertices for current batch + * @param state instance of inter-batch state management + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. It gets global indexes (not batch-wide!)
+ */ +template bool> +void weak_cc_batched(Index_ *labels, const Index_ *row_ind, + const Index_ *row_ind_ptr, Index_ nnz, Index_ N, + Index_ start_vertex_id, Index_ batch_size, + WeakCCState *state, cudaStream_t stream, + Lambda filter_op) { + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, + "Index_ should be 4 or 8 bytes"); + + bool host_m; + + Index_ MAX_LABEL = std::numeric_limits::max(); + weak_cc_init_all_kernel + <<>>( + labels, N, MAX_LABEL, filter_op); + CUDA_CHECK(cudaPeekAtLastError()); + + int n_iters = 0; + do { + CUDA_CHECK(cudaMemsetAsync(state->m, false, sizeof(bool), stream)); + + weak_cc_label_device + <<>>( + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, + batch_size, N, filter_op); + CUDA_CHECK(cudaPeekAtLastError()); + + //** Updating m * + raft::update_host(&host_m, state->m, 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + n_iters++; + } while (host_m); +} + +/** + * @brief Partial calculation of the weakly connected components in the + * context of a batched algorithm: the labels are computed wrt the sub-graph + * represented by the given CSR matrix of dimensions batch_size * N. 
+ * Note that this overwrites the labels array and it is the responsibility of + * the caller to combine the results from different batches + * (cf label/merge_labels.cuh) + * + * @tparam Index_ the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param start_vertex_id the starting vertex index for the current batch + * @param batch_size number of vertices for current batch + * @param state instance of inter-batch state management + * @param stream the cuda stream to use + */ +template +void weak_cc_batched(Index_ *labels, const Index_ *row_ind, + const Index_ *row_ind_ptr, Index_ nnz, Index_ N, + Index_ start_vertex_id, Index_ batch_size, + WeakCCState *state, cudaStream_t stream) { + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, + batch_size, state, stream, + [] __device__(Index_ tid) { return true; }); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Label package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010.
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Index_ the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param d_alloc: deviceAllocator to use for temp memory + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. It gets global indexes (not batch-wide!) + */ +template bool> +void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, + Index_ nnz, Index_ N, + std::shared_ptr d_alloc, + cudaStream_t stream, Lambda filter_op) { + raft::mr::device::buffer m(d_alloc, stream, 1); + + WeakCCState state(m.data()); + // Hand the device-side "labels changed" flag to the batched solver, which + // resets and polls it on every labelling pass. + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, + &state, stream, filter_op); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Label package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010.
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Index_ the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param d_alloc: deviceAllocator to use for temp memory + * @param stream the cuda stream to use + */ +template +void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, + Index_ nnz, Index_ N, + std::shared_ptr d_alloc, + cudaStream_t stream) { + raft::mr::device::buffer m(d_alloc, stream, 1); + WeakCCState state(m.data()); + // Accept-all filter; it is evaluated inside the kernels, so it must be a + // __device__ lambda like the one used by the weak_cc_batched wrapper. + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, + &state, stream, [] __device__(Index_) { return true; }); +} + +}; // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh new file mode 100644 index 0000000000..a0467b9566 --- /dev/null +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace distance { + +// @TODO: Move this into sparse prims (coo_norm) +template +__global__ void compute_binary_row_norm_kernel( + value_t *out, const value_idx *__restrict__ coo_rows, + const value_t *__restrict__ data, value_idx nnz) { + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { + // We do conditional here only because it's + // possible there could be some stray zeros in + // the sparse structure and removing them would be + // more expensive. + atomicAdd(&out[coo_rows[i]], data[i] == 1.0); + } +} + +template +__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, + const value_t *__restrict__ Q_norms, + const value_t *__restrict__ R_norms, + value_idx n_rows, value_idx n_cols, + expansion_f expansion_func) { + value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; + C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); +} + +template +void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, + value_idx n_rows, value_idx n_cols, + expansion_f expansion_func, cudaStream_t stream) { + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_binary_warp_kernel<<>>( + C, Q_norms, R_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, + const value_t *Q_data, value_idx Q_nnz, + const value_idx *R_coo_rows, const value_t *R_data, + value_idx R_nnz, value_idx m, value_idx n, + cusparseHandle_t handle, + std::shared_ptr alloc, + cudaStream_t stream, expansion_f expansion_func) { + raft::mr::device::buffer Q_norms(alloc, 
stream, m); + raft::mr::device::buffer R_norms(alloc, stream, n); + // Clear both accumulators on `stream`, the stream the buffers were allocated + // on, rather than on the implicit default stream. + CUDA_CHECK( + cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t), stream)); + CUDA_CHECK( + cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t), stream)); + + compute_binary_row_norm_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_binary_row_norm_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, + stream); +} + +/** + * Jaccard distance using the expanded form: + * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k)) + */ +template +class jaccard_expanded_distances_t : public distances_t { + public: + explicit jaccard_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(config.allocator, config.stream, 0), + ip_dists(config) {} + + void compute(value_t *out_dists) { + ip_dists.compute(out_dists); + + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); + + raft::mr::device::buffer search_coo_rows( + config_->allocator, config_->stream, config_->a_nnz); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, + config_->stream); + + compute_bin_distance( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle, config_->allocator, config_->stream, + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + return 1 - (dot / (q_r_union - dot)); + }); + } + + ~jaccard_expanded_distances_t() = default; + + private: + const distances_config_t *config_; + raft::mr::device::buffer workspace; + ip_distances_t ip_dists; +}; + +/** + * Dice distance using the 
expanded form: + * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k)^2 + sum(y_k)^2)) + */ +template +class dice_expanded_distances_t : public
distances_t { + public: + explicit dice_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(config.allocator, config.stream, 0), + ip_dists(config) {} + + void compute(value_t *out_dists) { + ip_dists.compute(out_dists); + + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); + + raft::mr::device::buffer search_coo_rows( + config_->allocator, config_->stream, config_->a_nnz); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, + config_->stream); + + compute_bin_distance( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle, config_->allocator, config_->stream, + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = (q_norm * q_norm) + (r_norm * r_norm); + // Dice *distance* is one minus the Dice coefficient, as in the + // expanded form documented on this class and in the Jaccard variant. + return 1 - ((2 * dot) / q_r_union); + }); + } + + ~dice_expanded_distances_t() = default; + + private: + const distances_config_t *config_; + raft::mr::device::buffer workspace; + ip_distances_t ip_dists; +}; + +}; // END namespace distance +}; // END namespace sparse +}; // END namespace raft diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h new file mode 100644 index 0000000000..712d2c52bd --- /dev/null +++ b/cpp/include/raft/sparse/distance/common.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace sparse { +namespace distance { + +template +struct distances_config_t { + // left side + value_idx a_nrows; + value_idx a_ncols; + value_idx a_nnz; + value_idx *a_indptr; + value_idx *a_indices; + value_t *a_data; + + // right side + value_idx b_nrows; + value_idx b_ncols; + value_idx b_nnz; + value_idx *b_indptr; + value_idx *b_indices; + value_t *b_data; + + cusparseHandle_t handle; + + std::shared_ptr allocator; + cudaStream_t stream; +}; + +template +class distances_t { + public: + virtual void compute(value_t *out) {} + virtual ~distances_t() = default; +}; + +}; // namespace distance +} // namespace sparse +}; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh new file mode 100644 index 0000000000..d596c6b852 --- /dev/null +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +#include + +namespace raft { +namespace sparse { +namespace distance { + +/** + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. 
+ * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel( + value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *rowsB, + value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_idx dim, + value_idx nnz_b, value_t *out, int n_blocks_per_row, int chunk_size, + product_f product_func, accum_f accum_func, write_f write_func) { + typedef cub::WarpReduce warp_reduce; + + value_idx cur_row_a = blockIdx.x / n_blocks_per_row; + value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; + + // chunk starting offset + value_idx ind_offset = cur_chunk_offset * chunk_size * tpb; + // how many total cols will be processed by this block (should be <= chunk_size * n_threads) + value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); + + int tid = threadIdx.x; + int warp_id = tid / raft::warp_size(); + + // compute id relative to current warp + unsigned int lane_id = tid & (raft::warp_size() - 1); + value_idx ind = ind_offset + threadIdx.x; + + extern __shared__ char smem[]; + + value_idx *offsets_a = (value_idx *)smem; + kv_t *A = (kv_t *)(offsets_a + 2); + typename warp_reduce::TempStorage *temp_storage = + 
(typename warp_reduce::TempStorage *)(A + dim); + + // Create dense vector A and populate with 0s + for (int k = tid; k < dim; k += blockDim.x) A[k] = 0; + + if (tid == 0) { + offsets_a[0] = indptrA[cur_row_a]; + offsets_a[1] = indptrA[cur_row_a + 1]; + } + + __syncthreads(); + + value_idx start_offset_a = offsets_a[0]; + value_idx stop_offset_a = offsets_a[1]; + + // Convert current row vector in A to dense + for (int i = tid; i < (stop_offset_a - start_offset_a); i += blockDim.x) { + A[indicesA[start_offset_a + i]] = dataA[start_offset_a + i]; + } + + __syncthreads(); + + if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return; + if (ind >= nnz_b) return; + + value_idx cur_row_b = -1; + value_t c = 0.0; + + auto warp_red = warp_reduce(*(temp_storage + warp_id)); + + // coalesced reads from B + if (tid < active_chunk_size) { + cur_row_b = rowsB[ind]; + value_t a_col = A[indicesB[ind]]; + if (!rev || a_col == 0.0) c = product_func(a_col, dataB[ind]); + } + + // loop through chunks in parallel, reducing when a new row is + // encountered by each thread + for (int i = tid; i < active_chunk_size; i += blockDim.x) { + value_idx ind_next = ind + blockDim.x; + value_idx next_row_b = -1; + + if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; + + bool diff_rows = next_row_b != cur_row_b; + + if (__any_sync(0xffffffff, diff_rows)) { + // grab the threads currently participating in loops. + // because any other threads should have returned already. + unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + + // thread with lowest lane id among peers writes out + if (is_leader && v != 0.0) { + // this conditional should be uniform, since rev is constant + size_t idx = !rev ? 
(size_t)cur_row_a * n + cur_row_b + : (size_t)cur_row_b * m + cur_row_a; + write_func(out + idx, v); + } + + c = 0.0; + } + + if (next_row_b != -1) { + ind = ind_next; + value_t a_col = A[indicesB[ind]]; + if (!rev || a_col == 0.0) + c = accum_func(c, product_func(a_col, dataB[ind])); + cur_row_b = next_row_b; + } + } +} + +/** + * Computes the maximum number of columns that can be stored + * in shared memory in dense form with the given block size + * and precision. + * @return the maximum number of columns that can be stored in smem + */ +template +inline int max_cols_per_block() { + // max cols = (total smem available - offsets for A - cub reduction smem) + return (raft::getSharedMemPerBlock() - (2 * sizeof(value_idx)) - + ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(value_t); +} + +template +inline int smem_per_block(int n_cols) { + int max_cols = max_cols_per_block(); + ASSERT(n_cols <= max_cols, "COO SPMV Requires max dimensionality of %d", + max_cols); + return (n_cols * sizeof(value_t)) + (2 * sizeof(value_idx)) + + ((tpb / raft::warp_size()) * sizeof(value_t)); +} + +/** + * Performs generalized sparse-matrix-sparse-matrix multiplication via a + * sparse-matrix-sparse-vector layout `out=A*B` where generalized product() + * and sum() operations can be used in place of the standard sum and product: + * + * out_ij = sum_k(product(A_ik, B_jk)) The sum goes through values of + * k=0..n_cols-1 where B_jk is nonzero. + * + * The product and sum operations shall form a semiring algebra with the + * following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 1} is a product monoid with identity element 1 + * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0 + * + * Each vector of A is loaded into shared memory in dense form and the + * non-zeros of B load balanced across the threads of each block.
+ * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n in row-major + * format. + * @param[in] config_ distance config object + * @param[in] coo_rows_b coo row array for B + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_b, product_f product_func, accum_f accum_func, + write_f write_func) { + CUDA_CHECK(cudaMemsetAsync( + out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.stream)); + int n_blocks_per_row = + raft::ceildiv(config_.b_nnz, chunk_size * threads_per_block); + int n_blocks = config_.a_nrows * n_blocks_per_row; + + int smem = + smem_per_block(config_.a_ncols); + + CUDA_CHECK(cudaFuncSetCacheConfig( + balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>( + config_.a_indptr, config_.a_indices, config_.a_data, coo_rows_b, + config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows, + config_.b_ncols, config_.b_nnz, out_dists, n_blocks_per_row, chunk_size, + product_func, accum_func, write_func); +}; + +/** + * Used for computing distances where the reduction (e.g. product()) function + * requires an implicit union (product(x, 0) = x) to capture the difference A-B. 
+ * This is necessary in some applications because the standard semiring algebra + * endowed with the default multiplication product monoid will only + * compute the intersection & B-A. + * + * This particular function is meant to accompany the function + * `balanced_coo_pairwise_generalized_spmv` and executes the product operation + * on only those columns that exist in B and not A. + * + * The product and sum operations shall enable the computation of a + * non-annihilating semiring algebra with the following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 0} is a product monoid with identity element 0 + * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x + * + * Manhattan distance sum(abs(x_k-y_k)) is a great example of when this type of + * execution pattern is necessary. + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n + * @param[in] config_ distance config object + * @param[in] coo_rows_a coo row array for A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_a, product_f product_func, accum_f accum_func, + write_f write_func) { + int n_blocks_per_row = + raft::ceildiv(config_.a_nnz, chunk_size * threads_per_block); + int n_blocks = config_.b_nrows * n_blocks_per_row; + + int
smem = + smem_per_block(config_.a_ncols); + + CUDA_CHECK(cudaFuncSetCacheConfig( + balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>( + config_.b_indptr, config_.b_indices, config_.b_data, coo_rows_a, + config_.a_indices, config_.a_data, config_.b_nrows, config_.a_nrows, + config_.a_ncols, config_.a_nnz, out_dists, n_blocks_per_row, chunk_size, + product_func, accum_func, write_func); +}; +} // namespace distance +} // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh new file mode 100644 index 0000000000..eff8f9281e --- /dev/null +++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace raft { +namespace sparse { +namespace distance { + +/** + * Semiring which schedules each row of B in a different thread. 
+ * A single row of A is shared by the whole block while every thread walks
+ * its own row of B, merging the two column lists and accumulating
+ * accum_func(cur_sum, product_func(b_value, a_value)) one column at a time.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb number of rows of B handled per batch, one per thread
+ *             (see load_b)
+ * @tparam buffer_size (not referenced by the member functions below)
+ * @tparam rows_per_block (not referenced by the member functions below)
+ */
+template
+struct BlockSemiring {
+  /**
+   * @param n_ number of rows in B (row stride of the dense output)
+   * @param shared_cols_ initial storage for the columns of the row of A
+   * @param shared_vals_ initial storage for the values of the row of A
+   * @param offsets_a_ two-element scratch array for the start/stop
+   *                   offsets of the row of A
+   */
+  __device__ inline BlockSemiring(value_idx n_, value_idx *shared_cols_,
+                                  value_t *shared_vals_, value_idx *offsets_a_)
+    : n(n_),
+      a_cols(shared_cols_),
+      a_vals(shared_vals_),
+      offsets_a(offsets_a_),
+      done(false),
+      a_idx(0),
+      b_row_count(0),
+      cur_sum(0.0) {}
+
+  /**
+   * Load columns for a single row of A into shared memory.
+   * Contains __syncthreads(), so it must be reached by every thread of
+   * the block.
+   * @param row row of A to load
+   * @param indptrA csr index pointer array of A
+   * @param indicesA csr column indices of A
+   * @param dataA csr values of A
+   */
+  __device__ inline void load_a_shared(value_idx row, value_idx *indptrA,
+                                       value_idx *indicesA, value_t *dataA) {
+    // a single thread fetches the row bounds, the barrier publishes them
+    if (threadIdx.x == 0) {
+      offsets_a[0] = indptrA[row];
+      offsets_a[1] = indptrA[row + 1];
+    }
+    __syncthreads();
+
+    value_idx start_offset_a = offsets_a[0];
+    value_idx stop_offset_a = offsets_a[1];
+
+    a_size = stop_offset_a - start_offset_a;
+
+    // Coalesce reads of row from matrix A into shared memory
+    for (int i = threadIdx.x; i < a_size; i += blockDim.x) {
+      a_cols[i] = indicesA[start_offset_a + i];
+      a_vals[i] = dataA[start_offset_a + i];
+    }
+
+    // the copied row must be complete before any thread iterates over it
+    __syncthreads();
+
+    row_a = row;
+  }
+
+  /**
+   * Sets the head for A's pointers so they can be
+   * iterated in each thread. This is used for the
+   * case when the maximum degree of any row in A
+   * is too large to fit into shared memory, so we
+   * default to increasing the size of the L1 cache
+   * and suffering the uncoalesced memory accesses
+   * for both A and B.
+   * @param row row of A to iterate
+   * @param indptrA csr index pointer array of A
+   * @param indicesA csr column indices of A
+   * @param dataA csr values of A
+   */
+  __device__ inline void load_a(value_idx row, value_idx *indptrA,
+                                value_idx *indicesA, value_t *dataA) {
+    offsets_a[0] = indptrA[row];
+    offsets_a[1] = indptrA[row + 1];
+
+    value_idx start_offset_a = offsets_a[0];
+    value_idx stop_offset_a = offsets_a[1];
+
+    a_size = stop_offset_a - start_offset_a;
+
+    // no copy is made: point directly at the row inside the input arrays
+    a_cols = indicesA + start_offset_a;
+    a_vals = dataA + start_offset_a;
+
+    row_a = row;
+  }
+
+  /**
+   * Prepare index & offsets for looping through rows of B.
+   * Resets the per-thread iteration state and assigns row
+   * start_row + threadIdx.x of B to the calling thread.
+   * @param start_row first row of B in this batch
+   * @param indptrB csr index pointer array of B
+   */
+  __device__ inline void load_b(value_idx start_row, value_idx *indptrB) {
+    done = false;
+    a_idx = 0;
+    cur_sum = 0.0;
+
+    value_idx start_row_b = start_row;
+    value_idx stop_row_b = min(start_row_b + tpb, n);
+
+    // number of valid rows in this batch; threads beyond it stay idle
+    n_rows_b = stop_row_b - start_row_b;
+
+    if (threadIdx.x < n_rows_b) {
+      row_b = start_row_b + threadIdx.x;
+      value_idx start_offset_b = indptrB[row_b];
+      b_row_count = indptrB[row_b + 1] - start_offset_b;
+      b_idx = start_offset_b;
+      b_idx_stop = start_offset_b + b_row_count;
+    }
+  }
+
+  /**
+   * Perform a single column intersection/union step for A & B
+   * based on the row of A loaded for the block and the row
+   * of B mapped to current thread.
+   * @param b_cols csr column indices of B
+   * @param b_vals csr values of B
+   * @param product_func semiring product() function
+   * @param accum_func semiring sum() function
+   */
+  __device__ inline void step(value_idx *b_cols, value_t *b_vals,
+                              product_f product_func, accum_f accum_func) {
+    if (threadIdx.x < n_rows_b) {
+      bool local_idx_in_bounds = b_idx < b_idx_stop && b_row_count > 0;
+
+      // -1 marks an exhausted iterator
+      value_idx b = local_idx_in_bounds ? b_cols[b_idx] : -1;
+      value_t bv = local_idx_in_bounds ? b_vals[b_idx] : 0.0;
+
+      bool a_idx_in_bounds = a_idx < a_size;
+
+      value_idx a = a_idx_in_bounds ? a_cols[a_idx] : -1;
+      value_t av = a_idx_in_bounds ? a_vals[a_idx] : 0.0;
+
+      // advance the side holding the smaller column (both on a tie); the
+      // side that does not advance contributes 0 to the product
+      bool run_b = ((b <= a && b != -1) || (b != -1 && a == -1));
+      b_idx += 1 * run_b;
+      value_t b_side = bv * run_b;
+
+      bool run_a = ((a <= b && a != -1) || (b == -1 && a != -1));
+      a_idx += 1 * run_a;
+      value_t a_side = av * run_a;
+
+      // Apply semiring "sum" & "product" functions locally
+      // (this also runs once with (0, 0) when both sides are exhausted)
+      cur_sum = accum_func(cur_sum, product_func(b_side, a_side));
+
+      // finished when all items in chunk have been
+      // processed
+      done = b == -1 && a == -1;
+
+    } else {
+      done = true;
+    }
+  }
+
+  /** @return true once both iterators of the current thread are exhausted */
+  __device__ inline bool isdone() { return done; }
+
+  /**
+   * Store the accumulated value at out[row_a * n + row_b].
+   * @param out dense output array
+   */
+  __device__ inline void write(value_t *out) {
+    if (threadIdx.x < n_rows_b) {
+      out[(size_t)row_a * n + row_b] = cur_sum;
+    }
+  }
+
+ private:
+  bool done;
+
+  // number of nonzeros in the current row of A
+  int a_size;
+
+  // number of rows of B in the current batch
+  value_idx n_rows_b;
+
+  // per-thread cursors into B's and A's nonzeros
+  value_idx b_idx;
+  value_idx b_idx_stop;
+  value_idx a_idx;
+
+  value_t cur_sum;
+
+  value_idx n;
+
+  value_idx row_a;
+  value_idx row_b;
+
+  value_idx *offsets_a;
+
+  // a_cols / a_vals reference the current row of A: the storage passed to
+  // the constructor after load_a_shared, or the input arrays after load_a
+  value_idx b_row_count;
+  value_idx *a_cols;
+  value_t *a_vals;
+};
+
+/**
+ * Optimized for large numbers of rows but small enough numbers of columns
+ * that each thread can process their rows in parallel.
+ *
+ * Each block handles one row of A (blockIdx.x / n_blocks_per_row) and up to
+ * n_rows_per_block rows of B, one row of B per thread and batch.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb block size
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @param[in] indptrA csr index pointer array for A
+ * @param[in] indicesA csr column indices array for A
+ * @param[in] dataA csr data array for A
+ * @param[in] indptrB csr index pointer array for B
+ * @param[in] indicesB csr column indices array for B
+ * @param[in] dataB csr data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[out] out dense output array of size m * n in row-major layout
+ * @param[in] n_blocks_per_row number of blocks of B scheduled per row of A
+ * @param[in] n_rows_per_block number of rows of B processed by each block
+ * @param[in] buffer_size number of nonzeros to store in smem
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template
+__global__ void classic_csr_semiring_spmv_smem_kernel(
+  value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *indptrB,
+  value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_t *out,
+  int n_blocks_per_row, int n_rows_per_block, int buffer_size,
+  product_f product_func, accum_f accum_func) {
+  value_idx out_row = blockIdx.x / n_blocks_per_row;
+  value_idx out_col_start = blockIdx.x % n_blocks_per_row;
+
+  value_idx row_b_start = out_col_start * n_rows_per_block;
+
+  // Bounds guard. It has to run before the row of A is loaded, because the
+  // load reads indptrA[out_row + 1]. The condition depends on blockIdx only,
+  // so the whole block leaves together and no thread is left waiting at the
+  // __syncthreads() inside load_a_shared.
+  if (out_row >= m || row_b_start >= n) return;
+
+  extern __shared__ char smem[];
+
+  // smem layout: [2 x value_idx offsets | buffer_size cols | vals]
+  // NOTE(review): a_vals starts (2 + buffer_size) * sizeof(value_idx) bytes
+  // into smem; if value_t is wider than value_idx an odd buffer_size looks
+  // like it would leave a_vals misaligned - confirm.
+  value_idx *offsets_a = (value_idx *)smem;
+  value_idx *a_cols = offsets_a + 2;
+  value_t *a_vals = (value_t *)(a_cols + buffer_size);
+
+  BlockSemiring semiring(
+    n, a_cols, a_vals, offsets_a);
+
+  semiring.load_a_shared(out_row, indptrA, indicesA, dataA);
+
+  // for each batch, parallelize the resulting rows across threads
+  for (int i = 0; i < n_rows_per_block; i += blockDim.x) {
+    semiring.load_b(row_b_start + i, indptrB);
+    do {
+      semiring.step(indicesB, dataB, product_func, accum_func);
+    } while (!semiring.isdone());
+
+    semiring.write(out);
+  }
+}
+
+/**
+ * Variant of classic_csr_semiring_spmv_smem_kernel which does not stage the
+ * row of A in shared memory: every thread reads A directly from the input
+ * arrays and keeps the row offsets in a thread-local array.
+ *
+ * @param[in] indptrA csr index pointer array for A
+ * @param[in] indicesA csr column indices array for A
+ * @param[in] dataA csr data array for A
+ * @param[in] indptrB csr index pointer array for B
+ * @param[in] indicesB csr column indices array for B
+ * @param[in] dataB csr data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[out] out dense output array of size m * n in row-major layout
+ * @param[in] n_blocks_per_row number of blocks of B scheduled per row of A
+ * @param[in] n_rows_per_block number of rows of B processed by each block
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template
+__global__ void classic_csr_semiring_spmv_kernel(
+  value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *indptrB,
+  value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_t *out,
+  int n_blocks_per_row, int n_rows_per_block, product_f product_func,
+  accum_f accum_func) {
+  value_idx out_row = blockIdx.x / n_blocks_per_row;
+  value_idx out_col_start = blockIdx.x % n_blocks_per_row;
+
+  value_idx row_b_start = out_col_start * n_rows_per_block;
+
+  // Bounds guard, evaluated before load_a dereferences indptrA[out_row + 1]
+  if (out_row >= m || row_b_start >= n) return;
+
+  value_idx offsets_a[2];
+
+  BlockSemiring semiring(
+    n, indicesA, dataA, offsets_a);
+
+  semiring.load_a(out_row, indptrA, indicesA, dataA);
+
+  // for each batch, parallelize the resulting rows across threads
+  for (int i = 0; i < n_rows_per_block; i += blockDim.x) {
+    semiring.load_b(row_b_start + i, indptrB);
+    do {
+      semiring.step(indicesB, dataB, product_func, accum_func);
+    } while (!semiring.isdone());
+
+    semiring.write(out);
+  }
+}
+
+/**
+ * Compute the maximum number of nonzeros that can be stored in shared
+ * memory per block with the given index and value precision
+ * @return max nnz that can be stored in smem per block
+ */
+template
+inline value_idx max_nnz_per_block() {
+  // max nnz = total smem - offsets for A
+  // (division because we need to store cols & vals separately)
+  return (raft::getSharedMemPerBlock() - (2 * sizeof(value_idx))) /
+         (sizeof(value_t) + sizeof(value_idx));
+}
+
+/**
+ * Computes max(in[i] - in[i - 1]) over i in [0, n) and merges the per-block
+ * result into *out with atomicMax. Element in[-1] is read for i == 0, so the
+ * caller has to pass a pointer that is preceded by a valid element
+ * (max_degree passes indptr + 1).
+ * @tparam value_idx index type
+ * @param[in,out] out running maximum; must be initialized by the caller
+ * @param[in] in array of offsets
+ * @param[in] n number of differences to evaluate
+ */
+template
+__global__ void max_kernel(value_idx *out, value_idx *in, value_idx n) {
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+  typedef cub::BlockReduce BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // threads past the end contribute 0
+  value_idx v = tid < n ? in[tid] - in[tid - 1] : 0;
+  value_idx agg = BlockReduce(temp_storage).Reduce(v, cub::Max());
+
+  if (threadIdx.x == 0) atomicMax(out, agg);
+}
+
+/**
+ * Returns the largest number of nonzeros of any row, i.e. max(diff(indptr)).
+ * Synchronizes the stream before returning.
+ * @param[in] indptr csr index pointer array
+ * @param[in] n_rows number of rows described by indptr
+ * @param[in] allocator device allocator for the temporary result
+ * @param[in] stream cuda stream to order the work on
+ * @return maximum row degree
+ */
+template
+inline value_idx max_degree(
+  value_idx *indptr, value_idx n_rows,
+  std::shared_ptr allocator, cudaStream_t stream) {
+  raft::mr::device::buffer max_d(allocator, stream, 1);
+  CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream));
+
+  /**
+   * A custom max reduction is performed until https://github.com/rapidsai/cuml/issues/3431
+   * is fixed.
+   */
+  max_kernel<<>>(
+    max_d.data(), indptr + 1, n_rows);
+
+  value_idx max_h;
+  raft::update_host(&max_h, max_d.data(), 1, stream);
+
+  // max_h is only valid once the copy has completed
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  return max_h;
+}
+
+/**
+ * Launches classic_csr_semiring_spmv_kernel (no shared-memory staging of A).
+ * @param[out] out_dists dense array of output distances
+ * @param[in] config_ distance config object
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template
+void _generalized_csr_pairwise_semiring(
+  value_t *out_dists, const distances_config_t &config_,
+  product_f product_func, accum_f accum_func) {
+  int n_chunks = 1;
+  int n_rows_per_block = min(n_chunks * threads_per_block, config_.b_nrows);
+  int n_blocks_per_row = raft::ceildiv(config_.b_nrows, n_rows_per_block);
+  int n_blocks = config_.a_nrows * n_blocks_per_row;
+
+  CUDA_CHECK(cudaFuncSetCacheConfig(
+    classic_csr_semiring_spmv_kernel,
+    cudaFuncCachePreferL1));
+
+  classic_csr_semiring_spmv_kernel
+    <<>>(
+      config_.a_indptr, config_.a_indices, config_.a_data, config_.b_indptr,
+      config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows,
+      out_dists, n_blocks_per_row, n_rows_per_block, product_func, accum_func);
+};
+
+/**
+ * Launches classic_csr_semiring_spmv_smem_kernel (rows of A staged in smem).
+ * @param[out] out_dists dense array of output distances
+ * @param[in] config_ distance config object
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] max_nnz number of nonzeros reserved in smem for a row of A
+ *                    (forwarded to the kernel as buffer_size)
+ */
+template
+void _generalized_csr_pairwise_smem_semiring(
+  value_t *out_dists, const distances_config_t &config_,
+  product_f product_func, accum_f accum_func, value_idx max_nnz) {
+  int n_chunks = 10000;
+  int n_rows_per_block = min(n_chunks * threads_per_block, config_.b_nrows);
+  int n_blocks_per_row = raft::ceildiv(config_.b_nrows, n_rows_per_block);
+  int n_blocks = config_.a_nrows * n_blocks_per_row;
+
+  // TODO: Figure out why performance is worse with smaller smem sizes
+  int smem_size = raft::getSharedMemPerBlock();
+
+  CUDA_CHECK(cudaFuncSetCacheConfig(
+    classic_csr_semiring_spmv_smem_kernel,
+    cudaFuncCachePreferShared));
+
+  classic_csr_semiring_spmv_smem_kernel
+    <<>>(
+      config_.a_indptr, config_.a_indices, config_.a_data, config_.b_indptr,
+      config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows,
+      out_dists, n_blocks_per_row, n_rows_per_block, max_nnz, product_func,
+      accum_func);
+}
+
+/**
+ * Perform generalized sparse-matrix-sparse-vector multiply in
+ * a semiring algebra by allowing the product and sum operations
+ * to be defined. This approach saves the most memory as it can
+ * work directly on a CSR w/o the need for conversion to another
+ * sparse format, does not require any transposition, nor loading
+ * any vectors in dense form. The major drawback to this kernel
+ * is that the non-uniform memory access pattern dominates performance.
+ * When the shared memory option is used, bank conflicts also dominate
+ * performance, making it slower than other options but guaranteeing
+ * that the product() operation will be executed across every column
+ * in A and B.
+ *
+ * This is primarily useful in cases where the product() operation
+ * is non-annihilating (e.g. product(x, 0) = x).
+ *
+ * There are two potential code paths for this primitive- if the largest
+ * degree of any row is small enough to fit in shared memory then shared
+ * memory is used to coalesce the reads from the vectors of A, otherwise
+ * no shared memory is used and all loads from A and B happen independently
+ * in separate threads.
+ *
+ * Iterators are maintained for the vectors from both A and B and each
+ * thread iterates to a maximum of |a|+|b| (which will happen only when
+ * the set of columns for vectors a and b are completely disjoint).
+ *
+ * TODO: Some potential things to try for future optimizations:
+ * - Always iterating for n_cols so that each warp is iterating
+ * a uniform number of times.
+ * - Computing an argsort() of B based on the number of columns
+ * in each row to attempt to load balance the warps naturally
+ * - Finding a way to coalesce the reads
+ *
+ * Ref: https://github.com/rapidsai/cuml/issues/3371
+ *
+ * Note: selecting the code path calls max_degree(), which synchronizes
+ * config_.stream, so this function blocks the host before launching.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @param[out] out_dists dense array of output distances size m * n in row-major layout
+ * @param[in] config_ distance config object
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template
+void generalized_csr_pairwise_semiring(
+  value_t *out_dists, const distances_config_t &config_,
+  product_f product_func, accum_f accum_func) {
+  // largest row of A (in nonzeros) that fits into shared memory
+  int nnz_upper_bound = max_nnz_per_block();
+
+  // max_nnz set from max(diff(indptrA))
+  // NOTE(review): the reason for the extra "+ 1" is not evident from this
+  // function - confirm.
+  value_idx max_nnz = max_degree(config_.a_indptr, config_.a_nrows,
+                                 config_.allocator, config_.stream) +
+                      1;
+
+  if (max_nnz <= nnz_upper_bound)
+    // use smem
+    _generalized_csr_pairwise_smem_semiring(
+      out_dists, config_, product_func, accum_func, max_nnz);
+
+  else
+    // load each row of A separately
+    _generalized_csr_pairwise_semiring(
+      out_dists, config_, product_func, accum_func);
+};
+
+} // namespace distance
+} // namespace sparse
+}; // namespace raft
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
new file mode 100644
index 0000000000..1559e9776f
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Compute pairwise distances between A and B, using the provided
+ * input configuration and distance function.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @param[out] out dense output array (size A.nrows * B.nrows)
+ * @param[in] input_config input argument configuration
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg extra metric parameter; only forwarded to the
+ *                       LpUnexpanded distance
+ * @throws via THROW() when the metric is not one of the cases below
+ */
+template
+void pairwiseDistance(value_t *out,
+                      distances_config_t input_config,
+                      raft::distance::DistanceType metric, float metric_arg) {
+  switch (metric) {
+    case raft::distance::DistanceType::L2Expanded:
+      l2_expanded_distances_t(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::InnerProduct:
+      ip_distances_t(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::L2Unexpanded:
+      l2_unexpanded_distances_t(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::L1:
+      l1_unexpanded_distances_t(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::LpUnexpanded:
+      lp_unexpanded_distances_t(input_config, metric_arg)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::Linf:
+      linf_unexpanded_distances_t(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::Canberra:
+      canberra_unexpanded_distances_t(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::JaccardExpanded:
+      jaccard_expanded_distances_t(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::CosineExpanded:
+      cosine_expanded_distances_t(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::HellingerExpanded:
+      hellinger_expanded_distances_t(input_config)
+        .compute(out);
+      break;
+
+    default:
+      THROW("Unsupported distance: %d", metric);
+  }
+}
+
+}; // namespace distance
+}; // namespace sparse
+}; // namespace raft
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
new file mode 100644
index 0000000000..a832c2b6a9
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * A simple interface that enables different instances
+ * of inner product. Currently, there are two implementations:
+ * cusparse gemm and our own semiring spmv.
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ */
+template
+class ip_trans_getters_t : public distances_t {
+ public:
+  /**
+   * A copy of B's data in coo format. This is
+   * useful for downstream distances that
+   * might be able to compute a norm instead of
+   * point-wise products.
+   * @return device pointer to the values of B
+   */
+  virtual value_t *b_data_coo() = 0;
+
+  /**
+   * A copy of B's rows in coo format. This is
+   * useful for downstream distances that
+   * might be able to compute a norm instead of
+   * point-wise products.
+   * @return device pointer to the row index of each value of B
+   */
+  virtual value_idx *b_rows_coo() = 0;
+
+  virtual ~ip_trans_getters_t() = default;
+};
+
+/**
+ * Simple inner product distance with sparse matrix multiply. This
+ * uses cusparse and requires both B to be transposed as well as
+ * the output to be explicitly converted to dense form (which requires
+ * 3 copies of the dense data- 2 for the cusparse csr output and
+ * 1 for the final m*n dense matrix.)
+ *
+ * The constructor switches the cusparse handle to host pointer mode; the
+ * destructor restores the mode that was active before.
+ */
+template
+class ip_distances_gemm_t : public ip_trans_getters_t {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   *
+   * TODO: Remove this once we have a semiring SPGEMM
+   * Ref: https://github.com/rapidsai/cuml/issues/3371
+   */
+  explicit ip_distances_gemm_t(
+    const distances_config_t &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      csc_indptr(config.allocator, config.stream, 0),
+      csc_indices(config.allocator, config.stream, 0),
+      csc_data(config.allocator, config.stream, 0),
+      alpha(1.0) {
+    init_mat_descriptor(matA);
+    init_mat_descriptor(matB);
+    init_mat_descriptor(matC);
+    init_mat_descriptor(matD);
+
+    CUSPARSE_CHECK(cusparseCreateCsrgemm2Info(&info));
+
+    // remember the caller's pointer mode so the destructor can restore it
+    CUSPARSE_CHECK(cusparseGetPointerMode(config.handle, &orig_ptr_mode));
+
+    CUSPARSE_CHECK(
+      cusparseSetPointerMode(config.handle, CUSPARSE_POINTER_MODE_HOST));
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    /**
+     * Compute pairwise distances as a CSR product and convert it to dense.
+     * NOTE(review): the trailing `true` passed to csr_to_dense presumably
+     * selects row-major output (matching the spmv implementation) - confirm.
+     */
+    raft::mr::device::buffer out_batch_indptr(
+      config_->allocator, config_->stream, config_->a_nrows + 1);
+    raft::mr::device::buffer out_batch_indices(config_->allocator,
+                                               config_->stream, 0);
+    raft::mr::device::buffer out_batch_data(config_->allocator,
+                                            config_->stream, 0);
+
+    // first pass: transpose B and determine the nnz of the product
+    value_idx out_batch_nnz = get_nnz(out_batch_indptr.data());
+
+    out_batch_indices.resize(out_batch_nnz, config_->stream);
+    out_batch_data.resize(out_batch_nnz, config_->stream);
+
+    // second pass: compute the product itself
+    compute_gemm(out_batch_indptr.data(), out_batch_indices.data(),
+                 out_batch_data.data());
+
+    raft::sparse::convert::csr_to_dense(
+      config_->handle, config_->a_nrows, config_->b_nrows,
+      out_batch_indptr.data(), out_batch_indices.data(), out_batch_data.data(),
+      config_->a_nrows, out_distances, config_->stream, true);
+  }
+
+  /** @return row indices of the transposed (csc) copy of B */
+  virtual value_idx *b_rows_coo() { return csc_indices.data(); }
+
+  /** @return values of the transposed (csc) copy of B */
+  value_t *b_data_coo() { return csc_data.data(); }
+
+  ~ip_distances_gemm_t() {
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matA));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matB));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matC));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matD));
+
+    // release the csrgemm2 info object created in the constructor
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyCsrgemm2Info(info));
+
+    CUSPARSE_CHECK_NO_THROW(
+      cusparseSetPointerMode(config_->handle, orig_ptr_mode));
+  }
+
+ private:
+  /** Creates a zero-based, general matrix descriptor. */
+  void init_mat_descriptor(cusparseMatDescr_t &mat) {
+    CUSPARSE_CHECK(cusparseCreateMatDescr(&mat));
+    CUSPARSE_CHECK(cusparseSetMatIndexBase(mat, CUSPARSE_INDEX_BASE_ZERO));
+    CUSPARSE_CHECK(cusparseSetMatType(mat, CUSPARSE_MATRIX_TYPE_GENERAL));
+  }
+
+  /**
+   * Transposes B, sizes the cusparse workspace and computes the row
+   * pointers and the number of nonzeros of the product.
+   * @param[out] csr_out_indptr row pointers of the product (a_nrows + 1)
+   * @return number of nonzeros of the product
+   */
+  value_idx get_nnz(value_idx *csr_out_indptr) {
+    value_idx m = config_->a_nrows, n = config_->b_nrows, k = config_->a_ncols;
+
+    transpose_b();
+
+    size_t workspace_size;
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2_buffersizeext(
+      config_->handle, m, n, k, &alpha, NULL, matA, config_->a_nnz,
+      config_->a_indptr, config_->a_indices, matB, config_->b_nnz,
+      csc_indptr.data(), csc_indices.data(), matD, 0, NULL, NULL, info,
+      &workspace_size, config_->stream));
+
+    workspace.resize(workspace_size, config_->stream);
+
+    value_idx out_nnz = 0;
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2nnz(
+      config_->handle, m, n, k, matA, config_->a_nnz, config_->a_indptr,
+      config_->a_indices, matB, config_->b_nnz, csc_indptr.data(),
+      csc_indices.data(), matD, 0, NULL, NULL, matC, csr_out_indptr, &out_nnz,
+      info, workspace.data(), config_->stream));
+
+    return out_nnz;
+  }
+
+  /**
+   * Computes the product of A and the transposed copy of B into the given
+   * csr arrays. Requires get_nnz() to have been called first. The stream is
+   * synchronized before and after the cusparse call.
+   */
+  void compute_gemm(const value_idx *csr_out_indptr, value_idx *csr_out_indices,
+                    value_t *csr_out_data) {
+    value_idx m = config_->a_nrows, n = config_->b_nrows, k = config_->a_ncols;
+
+    CUDA_CHECK(cudaStreamSynchronize(config_->stream));
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2(
+      config_->handle, m, n, k, &alpha, matA, config_->a_nnz, config_->a_data,
+      config_->a_indptr, config_->a_indices, matB, config_->b_nnz,
+      csc_data.data(), csc_indptr.data(), csc_indices.data(), NULL, matD, 0,
+      NULL, NULL, NULL, matC, csr_out_data, csr_out_indptr, csr_out_indices,
+      info, workspace.data(), config_->stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(config_->stream));
+  }
+
+  void transpose_b() {
+    /**
+     * Transpose index array into csc
+     */
+    csc_indptr.resize(config_->b_ncols + 1, config_->stream);
+    csc_indices.resize(config_->b_nnz, config_->stream);
+    csc_data.resize(config_->b_nnz, config_->stream);
+
+    raft::sparse::linalg::csr_transpose(
+      config_->handle, config_->b_indptr, config_->b_indices, config_->b_data,
+      csc_indptr.data(), csc_indices.data(), csc_data.data(), config_->b_nrows,
+      config_->b_ncols, config_->b_nnz, config_->allocator, config_->stream);
+  }
+
+  value_t alpha;
+  csrgemm2Info_t info;
+  cusparseMatDescr_t matA;
+  cusparseMatDescr_t matB;
+  cusparseMatDescr_t matC;
+  cusparseMatDescr_t matD;
+  cusparsePointerMode_t orig_ptr_mode;
+  raft::mr::device::buffer workspace;
+  raft::mr::device::buffer csc_indptr;
+  raft::mr::device::buffer csc_indices;
+  raft::mr::device::buffer csc_data;
+  const distances_config_t *config_;
+};
+
+/**
+ * Inner product distance computed with the load-balanced semiring spmv.
+ * B is converted to COO rows once, in the constructor.
+ */
+template
+class ip_distances_spmv_t : public ip_trans_getters_t {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   */
+  ip_distances_spmv_t(const distances_config_t &config)
+    : config_(&config),
+      coo_rows_b(config.allocator, config.stream, config.b_nnz) {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                      coo_rows_b.data(), config_->b_nnz,
+                                      config_->stream);
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    /**
+     * Compute pairwise distances and return dense matrix in row-major format
+     */
+    balanced_coo_pairwise_generalized_spmv(
+      out_distances, *config_, coo_rows_b.data(), Product(), Sum(),
+      AtomicAdd());
+  }
+
+  /** @return coo row index of every nonzero of B */
+  value_idx *b_rows_coo() { return coo_rows_b.data(); }
+
+  /** @return B's values (the csr data array is reused, no copy is made) */
+  value_t *b_data_coo() { return config_->b_data; }
+
+  ~ip_distances_spmv_t() = default;
+
+ private:
+  const distances_config_t *config_;
+  raft::mr::device::buffer coo_rows_b;
+};
+
+/**
+ * Inner product distance which picks one of the two implementations above
+ * at construction time, based on whether config.a_ncols is below
+ * max_cols_per_block().
+ */
+template
+class ip_distances_t : public distances_t {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   */
+  explicit ip_distances_t(const distances_config_t &config)
+    : config_(&config) {
+    if (config_->a_ncols < max_cols_per_block()) {
+      internal_ip_dist =
+        std::make_unique>(*config_);
+    } else {
+      internal_ip_dist =
+        std::make_unique>(*config_);
+    }
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    /**
+     * Delegate to the implementation selected in the constructor
+     */
+    internal_ip_dist->compute(out_distances);
+  }
+
+  /** @return coo rows of B as exposed by the selected implementation */
+  virtual value_idx *b_rows_coo() const {
+    return internal_ip_dist->b_rows_coo();
+  }
+
+  /** @return values of B as exposed by the selected implementation */
+  virtual value_t *b_data_coo() const { return internal_ip_dist->b_data_coo(); }
+
+ private:
+  const distances_config_t *config_;
+  std::unique_ptr> internal_ip_dist;
+};
+
+/**
+ * Explicit template instantiations.
+ */
+template class ip_distances_t;
+template class distances_config_t;
+
+}; // END namespace distance
+}; // END namespace sparse
+}; // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
new file mode 100644
index 0000000000..9d481e34ef
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+// @TODO: Move this into sparse prims (coo_norm)
+/**
+ * Accumulates the squared values of every nonzero into the entry of its
+ * row: out[coo_rows[i]] += data[i] * data[i]. `out` has to be zeroed by the
+ * caller.
+ * @param[in,out] out per-row sums of squares
+ * @param[in] coo_rows row index of every nonzero
+ * @param[in] data value of every nonzero
+ * @param[in] nnz number of nonzeros
+ */
+template
+__global__ void compute_row_norm_kernel(value_t *out,
+                                        const value_idx *__restrict__ coo_rows,
+                                        const value_t *__restrict__ data,
+                                        value_idx nnz) {
+  value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < nnz) {
+    atomicAdd(&out[coo_rows[i]], data[i] * data[i]);
+  }
+}
+
+/**
+ * Replaces every dot product C[i, j] in place by
+ * expansion_func(C[i, j], Q_sq_norms[i], R_sq_norms[j]); results with an
+ * absolute value below 0.0001 are clamped to 0.
+ * @param[in,out] C dense n_rows * n_cols matrix of dot products
+ * @param[in] Q_sq_norms per-row sums of squares of the first operand
+ * @param[in] R_sq_norms per-row sums of squares of the second operand
+ * @param[in] n_rows number of rows of C
+ * @param[in] n_cols number of columns of C
+ * @param[in] expansion_func maps (dot, q_norm, r_norm) to the distance
+ */
+template
+__global__ void compute_euclidean_warp_kernel(
+  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
+  const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols,
+  expansion_f expansion_func) {
+  // 64-bit flat index: the launcher sizes the grid from
+  // (size_t)n_rows * n_cols, which may not fit into value_idx
+  size_t tid = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+  value_idx i = tid / n_cols;
+  value_idx j = tid % n_cols;
+
+  if (i >= n_rows || j >= n_cols) return;
+
+  value_t dot = C[(size_t)i * n_cols + j];
+
+  // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm
+  value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]);
+
+  // correct for small instabilities
+  if (fabs(val) < 0.0001) val = 0.0;
+
+  C[(size_t)i * n_cols + j] = val;
+}
+
+/**
+ * Launches compute_euclidean_warp_kernel with one thread per element of C.
+ */
+template
+void compute_euclidean(value_t *C, const value_t *Q_sq_norms,
+                       const value_t *R_sq_norms, value_idx n_rows,
+                       value_idx n_cols, cudaStream_t stream,
+                       expansion_f expansion_func) {
+  int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb);
+  compute_euclidean_warp_kernel<<>>(
+    C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
+}
+
+/**
+ * Turns a dense matrix of dot products into distances: computes the
+ * per-row sums of squares of Q and R and applies expansion_func to every
+ * element of `out`.
+ * @param[in,out] out dense m * n matrix of dot products, overwritten
+ * @param[in] Q_coo_rows coo rows of Q
+ * @param[in] Q_data values of Q
+ * @param[in] Q_nnz number of nonzeros of Q
+ * @param[in] R_coo_rows coo rows of R
+ * @param[in] R_data values of R
+ * @param[in] R_nnz number of nonzeros of R
+ * @param[in] m number of rows of Q
+ * @param[in] n number of rows of R
+ * @param[in] handle cusparse handle (unused in this function)
+ * @param[in] alloc device allocator for the temporary norm buffers
+ * @param[in] stream cuda stream to order the work on
+ * @param[in] expansion_func maps (dot, q_norm, r_norm) to the distance
+ */
+template
+void compute_l2(value_t *out, const value_idx *Q_coo_rows,
+                const value_t *Q_data, value_idx Q_nnz,
+                const value_idx *R_coo_rows, const value_t *R_data,
+                value_idx R_nnz, value_idx m, value_idx n,
+                cusparseHandle_t handle,
+                std::shared_ptr alloc,
+                cudaStream_t stream, expansion_f expansion_func) {
+  raft::mr::device::buffer Q_sq_norms(alloc, stream, m);
+  raft::mr::device::buffer R_sq_norms(alloc, stream, n);
+  // The accumulators must be zeroed on the same stream the buffers were
+  // allocated on; omitting the stream argument would enqueue the memset on
+  // the default stream, unordered with respect to the work on `stream`.
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0,
+                             Q_sq_norms.size() * sizeof(value_t), stream));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0,
+                             R_sq_norms.size() * sizeof(value_t), stream));
+
+  compute_row_norm_kernel<<>>(
+    Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_row_norm_kernel<<>>(
+    R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream,
+                    expansion_func);
+}
+
+/**
+ * L2 distance using the expanded form: sum(x_k^2) + sum(y_k^2) - 2 * sum(x_k * y_k)
+ * The expanded form is more efficient for sparse data.
+ */
+template
+class l2_expanded_distances_t : public distances_t {
+ public:
+  /**
+   * @param config distance configuration. Only its address is kept, so the
+   *               object must outlive this instance.
+   */
+  explicit l2_expanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  /**
+   * Fills out_dists with inner products and then expands them in place
+   * with -2 * dot + q_norm + r_norm.
+   *
+   * @param out_dists output array (a_nrows x b_nrows values)
+   */
+  void compute(value_t *out_dists) {
+    // Inner products first; the expansion below overwrites them in place.
+    ip_dists.compute(out_dists);
+
+    // COO rows / values of B as exposed by the inner-product helper.
+    value_idx *index_coo_rows = ip_dists.b_rows_coo();
+    value_t *index_coo_vals = ip_dists.b_data_coo();
+
+    // A only exists in CSR form, so expand its row pointer into COO rows.
+    raft::mr::device::buffer query_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      query_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+
+    compute_l2(
+      out_dists, query_coo_rows.data(), config_->a_data, config_->a_nnz,
+      index_coo_rows, index_coo_vals, config_->b_nnz, config_->a_nrows,
+      config_->b_nrows, config_->handle, config_->allocator, config_->stream,
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        return -2 * dot + q_norm + r_norm;
+      });
+  }
+
+  ~l2_expanded_distances_t() = default;
+
+ private:
+  // Borrowed, never owned.
+  const distances_config_t *config_;
+  // Zero-length buffer; compute() does not touch it.
+  raft::mr::device::buffer workspace;
+  ip_distances_t ip_dists;
+};
+
+/**
+ * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k^2)) * sqrt(sum(y_k^2))))
+ * The expanded form is more efficient for sparse data.
+ */
+template
+class cosine_expanded_distances_t : public distances_t {
+ public:
+  /**
+   * @param config distance configuration. Only its address is kept, so the
+   *               object must outlive this instance.
+   */
+  explicit cosine_expanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  /**
+   * Fills out_dists with inner products and then converts each one in place
+   * to 1 - dot / (sqrt(q_norm) * sqrt(r_norm)).
+   *
+   * @param out_dists output array (a_nrows x b_nrows values)
+   */
+  void compute(value_t *out_dists) {
+    // Inner products first; the expansion below overwrites them in place.
+    ip_dists.compute(out_dists);
+
+    // COO rows / values of B as exposed by the inner-product helper.
+    value_idx *index_coo_rows = ip_dists.b_rows_coo();
+    value_t *index_coo_vals = ip_dists.b_data_coo();
+
+    // A only exists in CSR form, so expand its row pointer into COO rows.
+    raft::mr::device::buffer query_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      query_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+
+    compute_l2(
+      out_dists, query_coo_rows.data(), config_->a_data, config_->a_nnz,
+      index_coo_rows, index_coo_vals, config_->b_nnz, config_->a_nrows,
+      config_->b_nrows, config_->handle, config_->allocator, config_->stream,
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        value_t norms = sqrt(q_norm) * sqrt(r_norm);
+        // Branch-free guard: when norms is 0 this evaluates 0 / 1,
+        // so the similarity becomes 0 instead of dividing by zero.
+        value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+        return 1 - cos;
+      });
+  }
+
+  ~cosine_expanded_distances_t() = default;
+
+ private:
+  // Borrowed, never owned.
+  const distances_config_t *config_;
+  // Zero-length buffer; compute() does not touch it.
+  raft::mr::device::buffer workspace;
+  ip_distances_t ip_dists;
+};
+
+/**
+ * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k)))
+ * The expanded form is more efficient for sparse data.
+ *
+ * This distance computation modifies A and B by computing a sqrt
+ * and then performing a `pow(x, 2)` to convert it back. Because of this,
+ * it is possible that the values in A and B might differ slightly
+ * after this is invoked.
+ */
+template
+class hellinger_expanded_distances_t : public distances_t {
+ public:
+  /**
+   * @param config distance configuration. Only its address is kept, so the
+   *               object must outlive this instance.
+   */
+  explicit hellinger_expanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  /**
+   * Takes the square root of the values of A and B in place, computes their
+   * inner products into out_dists, squares A and B back, and finally maps
+   * every output element to sqrt(1 - value).
+   *
+   * @param out_dists output array (a_nrows x b_nrows values)
+   */
+  void compute(value_t *out_dists) {
+    // First sqrt A and B
+    raft::linalg::unaryOp(
+      config_->a_data, config_->a_data, config_->a_nnz,
+      [=] __device__(value_t input) { return sqrt(input); }, config_->stream);
+
+    // When A and B share one value array it has already been transformed.
+    if (config_->a_data != config_->b_data) {
+      raft::linalg::unaryOp(
+        config_->b_data, config_->b_data, config_->b_nnz,
+        [=] __device__(value_t input) { return sqrt(input); }, config_->stream);
+    }
+
+    ip_dists.compute(out_dists);
+
+    // Revert sqrt of A and B
+    raft::linalg::unaryOp(
+      config_->a_data, config_->a_data, config_->a_nnz,
+      [=] __device__(value_t input) { return input * input; }, config_->stream);
+    if (config_->a_data != config_->b_data) {
+      raft::linalg::unaryOp(
+        config_->b_data, config_->b_data, config_->b_nnz,
+        [=] __device__(value_t input) { return input * input; },
+        config_->stream);
+    }
+
+    raft::linalg::unaryOp(
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      [=] __device__(value_t input) {
+        // Rounding can push the inner product marginally above 1, which
+        // would make the radicand negative and the result NaN. Zero the
+        // radicand in that case; a NaN input still propagates.
+        bool rectifier = (1 - input) > 0;
+        return sqrt(rectifier * (1 - input));
+      },
+      config_->stream);
+  }
+
+  ~hellinger_expanded_distances_t() = default;
+
+ private:
+  // Borrowed, never owned.
+  const distances_config_t *config_;
+  // Zero-length buffer; compute() does not touch it.
+  raft::mr::device::buffer workspace;
+  ip_distances_t ip_dists;
+};
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
new file mode 100644
index 0000000000..e991224f1b
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Shared driver for the unexpanded distances in this file.
+ *
+ * When a_ncols is below max_cols_per_block() it runs the balanced COO
+ * generalized SPMV twice: once with the COO rows of B, then the `_rev`
+ * variant with the COO rows of A. Otherwise it falls back to the
+ * generalized CSR pairwise semiring.
+ *
+ * @param out_dists    output array the primitives write into
+ * @param config_      distance configuration describing A and B
+ * @param product_func element-wise combine of a pair of values
+ * @param accum_func   reduction over combined values
+ * @param write_func   functor used to write into out_dists (only passed
+ *                     to the COO SPMV primitives)
+ */
+template
+
+void unexpanded_lp_distances(
+  value_t *out_dists, const distances_config_t *config_,
+  product_f product_func, accum_f accum_func, write_f write_func) {
+  /**
+   * @TODO: Main logic here:
+   *
+   *  - if n_cols < available smem, just use dense conversion for rows of A
+   *  - if n_cols > available smem but max nnz < available smem, use hashing
+   *    (not yet available)
+   *  - if n_cols > available smem & max_nnz > available smem,
+   *    use batching + hashing only for those large cols
+   *  Ref: https://github.com/rapidsai/cuml/issues/3371
+   */
+
+  if (!(config_->a_ncols < max_cols_per_block())) {
+    // TODO: Find max nnz and set smem based on this value.
+    // Ref: https://github.com/rapidsai/cuml/issues/3371
+    generalized_csr_pairwise_semiring(
+      out_dists, *config_, product_func, accum_func);
+    return;
+  }
+
+  // TODO: Use n_cols to set shared memory and threads per block
+  // for max occupancy.
+  // Ref: https://github.com/rapidsai/cuml/issues/3371
+
+  // One scratch buffer, sized for the larger matrix, serves both passes.
+  raft::mr::device::buffer scratch_coo_rows(
+    config_->allocator, config_->stream, max(config_->b_nnz, config_->a_nnz));
+
+  // Pass 1: COO rows of B.
+  raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                    scratch_coo_rows.data(), config_->b_nnz,
+                                    config_->stream);
+
+  balanced_coo_pairwise_generalized_spmv(
+    out_dists, *config_, scratch_coo_rows.data(), product_func, accum_func,
+    write_func);
+
+  // Pass 2: overwrite the scratch buffer with the COO rows of A.
+  raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                    scratch_coo_rows.data(), config_->a_nnz,
+                                    config_->stream);
+
+  balanced_coo_pairwise_generalized_spmv_rev(
+    out_dists, *config_, scratch_coo_rows.data(), product_func, accum_func,
+    write_func);
+}
+
+/**
+ * Computes L1 distances for sparse input. This does not have
+ * an equivalent expanded form, so it is only executed in
+ * an unexpanded form.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template
+class l1_unexpanded_distances_t : public distances_t {
+ public:
+  /** @param config distance configuration; only its address is stored. */
+  l1_unexpanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config) {}
+
+  /** Accumulates |a - b| with Sum / AtomicAdd into out_dists. */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances(out_dists, config_, AbsDiff(),
+                            Sum(), AtomicAdd());
+  }
+
+ private:
+  const distances_config_t *config_;
+};
+
+/**
+ * Unexpanded distance that accumulates (a - b)^2 with Sum / AtomicAdd.
+ * No square root is applied to the result.
+ */
+template
+class l2_unexpanded_distances_t : public distances_t {
+ public:
+  /** @param config distance configuration; only its address is stored. */
+  l2_unexpanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config) {}
+
+  /** Accumulates (a - b)^2 with Sum / AtomicAdd into out_dists. */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances(out_dists, config_, SqDiff(),
+                            Sum(), AtomicAdd());
+  }
+
+ private:
+  const distances_config_t *config_;
+};
+
+/**
+ * Unexpanded distance that reduces |a - b| with Max / AtomicMax.
+ */
+template
+class linf_unexpanded_distances_t : public distances_t {
+ public:
+  /** @param config distance configuration; only its address is stored. */
+  explicit linf_unexpanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config) {}
+
+  /** Reduces |a - b| with Max / AtomicMax into out_dists. */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances(out_dists, config_, AbsDiff(),
+                            Max(), AtomicMax());
+  }
+
+ private:
+  const distances_config_t *config_;
+};
+
+/**
+ * Unexpanded distance that accumulates |a - b| / (|a| + |b|) with
+ * Sum / AtomicAdd; a term whose denominator is 0 contributes 0.
+ */
+template
+class canberra_unexpanded_distances_t : public distances_t {
+ public:
+  /** @param config distance configuration; only its address is stored. */
+  explicit canberra_unexpanded_distances_t(
+    const distances_config_t &config)
+    : config_(&config) {}
+
+  /** Accumulates the per-element Canberra terms into out_dists. */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances(
+      out_dists, config_,
+      [] __device__(value_t a, value_t b) {
+        value_t d = fabs(a) + fabs(b);
+
+        // deal with potential for 0 in denominator by
+        // forcing 0/1 instead
+        return ((d != 0) * fabs(a - b)) / (d + (d == 0));
+      },
+      Sum(), AtomicAdd());
+  }
+
+ private:
+  const distances_config_t *config_;
+};
+
+/**
+ * Unexpanded distance that accumulates PDiff(p) terms with Sum / AtomicAdd
+ * and then raises every output element to the power 1 / p.
+ */
+template
+class lp_unexpanded_distances_t : public distances_t {
+ public:
+  /**
+   * @param config distance configuration; only its address is stored
+   * @param p_     exponent of the distance
+   */
+  explicit lp_unexpanded_distances_t(
+    const distances_config_t &config, value_t p_)
+    : config_(&config), p(p_) {}
+
+  /** Accumulates the p-th power terms, then applies the 1 / p root. */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances(out_dists, config_, PDiff(p),
+                            Sum(), AtomicAdd());
+
+    // Keep the reciprocal in value_t; computing it as float would round
+    // the exponent to single precision when value_t is double.
+    value_t one_over_p = value_t(1) / p;
+    raft::linalg::unaryOp(
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      [=] __device__(value_t input) { return pow(input, one_over_p); },
+      config_->stream);
+  }
+
+ private:
+  const distances_config_t *config_;
+  value_t p;
+};
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh
new file mode 100644
index 0000000000..d14a42b407
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/operators.cuh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/** Binary functor returning a + b. */
+struct Sum {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return a + b;
+  }
+};
+
+/** Binary functor returning (a - b)^2. */
+struct SqDiff {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return (a - b) * (a - b);
+  }
+};
+
+/** Binary functor returning |a - b|^p for the exponent given at construction. */
+struct PDiff {
+  float p;
+
+  PDiff(float p_) : p(p_) {}
+
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    // Raise the magnitude of the difference: a signed base yields negative
+    // terms for odd p and NaN for fractional p.
+    return pow(fabs(a - b), p);
+  }
+};
+
+/** Binary functor returning fmax(a, b). */
+struct Max {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return fmax(a, b);
+  }
+};
+
+/** Functor forwarding to atomicAdd(a, b) and returning its result. */
+struct AtomicAdd {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
+    return atomicAdd(a, b);
+  }
+};
+
+/** Functor forwarding to atomicMax(a, b) and returning its result. */
+struct AtomicMax {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
+    return atomicMax(a, b);
+  }
+};
+
+/** Binary functor returning a * b. */
+struct Product {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return a * b;
+  }
+};
+
+/** Binary functor returning |a - b|. */
+struct AbsDiff {
+  template
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return fabs(a - b);
+  }
+};
+}  // namespace distance
+}  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
new file mode 100644
index 0000000000..bf3e93a06f
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * Counts, for every row, how many distinct columns the sum A + B has.
+ * One thread handles one row. The per-row count is written to
+ * out_rowcounts[row] and also added atomically to out_rowcounts[m].
+ *
+ * Note the naming used throughout this file: `*_ind` holds the per-row
+ * start offsets and `*_indptr` holds the column index of each entry.
+ *
+ * @param a_ind         row start offsets of A
+ * @param a_indptr      column indices of A
+ * @param a_val         values of A (not read here)
+ * @param nnz1          number of entries of A
+ * @param b_ind         row start offsets of B
+ * @param b_indptr      column indices of B
+ * @param b_val         values of B (not read here)
+ * @param nnz2          number of entries of B
+ * @param m             number of rows
+ * @param out_rowcounts output of size m + 1; slot m receives the total
+ */
+template
+__global__ void csr_add_calc_row_counts_kernel(
+  const int *a_ind, const int *a_indptr, const T *a_val, int nnz1,
+  const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m,
+  int *out_rowcounts) {
+  // loop through columns in each set of rows and
+  // calculate number of unique cols across both rows
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (row < m) {
+    int a_start_idx = a_ind[row];
+    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
+
+    int b_start_idx = b_ind[row];
+    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+
+    /**
+     * Size of the union of the columns in this row of A and B: every
+     * column of A counts, plus each column of B that A does not have.
+     * A's columns are searched in place, so no per-thread heap allocation
+     * (which can fail on the device and needs array delete) is required.
+     */
+    int final_size = a_stop_idx - a_start_idx;
+
+    for (int j = b_start_idx; j < b_stop_idx; j++) {
+      int cur_col = b_indptr[j];
+      bool found = false;
+      for (int k = a_start_idx; k < a_stop_idx; k++) {
+        if (a_indptr[k] == cur_col) {
+          found = true;
+          break;
+        }
+      }
+
+      if (!found) {
+        final_size++;
+      }
+    }
+
+    out_rowcounts[row] = final_size;
+    raft::myAtomicAdd(out_rowcounts + m, final_size);
+  }
+}
+
+/**
+ * Writes the column indices and values of A + B. One thread handles one
+ * row: it copies the row of A to the output starting at out_ind[row], then
+ * merges the row of B, adding values whose column is already present and
+ * appending the others.
+ *
+ * @param a_ind      row start offsets of A
+ * @param a_indptr   column indices of A
+ * @param a_val      values of A
+ * @param nnz1       number of entries of A
+ * @param b_ind      row start offsets of B
+ * @param b_indptr   column indices of B
+ * @param b_val      values of B
+ * @param nnz2       number of entries of B
+ * @param m          number of rows
+ * @param out_ind    row start offsets of the output (read, not written)
+ * @param out_indptr output column indices
+ * @param out_val    output values
+ */
+template
+__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
+                               const T *a_val, int nnz1, const int *b_ind,
+                               const int *b_indptr, const T *b_val, int nnz2,
+                               int m, int *out_ind, int *out_indptr,
+                               T *out_val) {
+  // 1 thread per row
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (row < m) {
+    int a_start_idx = a_ind[row];
+
+    // TODO: Shouldn't need this if rowind is proper CSR
+    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
+
+    int b_start_idx = b_ind[row];
+    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+
+    int o_idx = out_ind[row];
+
+    // Seed the output row with a verbatim copy of A's row.
+    int cur_o_idx = o_idx;
+    for (int j = a_start_idx; j < a_stop_idx; j++) {
+      out_indptr[cur_o_idx] = a_indptr[j];
+      out_val[cur_o_idx] = a_val[j];
+      cur_o_idx++;
+    }
+
+    // Number of entries currently held by the output row.
+    int arr_size = cur_o_idx - o_idx;
+    for (int j = b_start_idx; j < b_stop_idx; j++) {
+      int cur_col = b_indptr[j];
+      bool found = false;
+      for (int k = o_idx; k < o_idx + arr_size; k++) {
+        // If we found a match, sum the two values
+        if (out_indptr[k] == cur_col) {
+          out_val[k] += b_val[j];
+          found = true;
+          break;
+        }
+      }
+
+      // if we didn't find a match, add the value for b
+      if (!found) {
+        out_indptr[o_idx + arr_size] = cur_col;
+        out_val[o_idx + arr_size] = b_val[j];
+        arr_size++;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Calculate the CSR row_ind array that would result
+ * from
summing together two CSR matrices
+ * @param a_ind: left hand row_ind array
+ * @param a_indptr: left hand index_ptr array
+ * @param a_val: left hand data array
+ * @param nnz1: size of left hand index_ptr and val arrays
+ * @param b_ind: right hand row_ind array
+ * @param b_indptr: right hand index_ptr array
+ * @param b_val: right hand data array
+ * @param nnz2: size of right hand index_ptr and val arrays
+ * @param m: size of output array (number of rows in final matrix)
+ * @param out_ind: output row_ind array
+ * @param d_alloc: device allocator to use for temp memory
+ * @param stream: cuda stream to use
+ * @return total number of entries of the summed matrix (the stream is
+ *         synchronized to read this value back)
+ */
+template
+size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
+                         int nnz1, const int *b_ind, const int *b_indptr,
+                         const T *b_val, int nnz2, int m, int *out_ind,
+                         std::shared_ptr d_alloc,
+                         cudaStream_t stream) {
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+  dim3 blk(TPB_X, 1, 1);
+
+  // m per-row counts followed by one slot for the grand total.
+  raft::mr::device::buffer row_counts(d_alloc, stream, m + 1);
+  CUDA_CHECK(
+    cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
+
+  csr_add_calc_row_counts_kernel
+    <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+         b_val, nnz2, m, row_counts.data());
+  // Surface a failed launch here instead of reading back a stale total.
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  int cnnz = 0;
+  raft::update_host(&cnnz, row_counts.data() + m, 1, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  // create csr compressed row index from row counts
+  thrust::device_ptr row_counts_d =
+    thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), row_counts_d,
+                         row_counts_d + m, c_ind_d);
+
+  return cnnz;
+}
+
+/**
+ * @brief Compute the column indices and values of the sum of two
+ * CSR matrices, given the row_ind array of the result
+ * @param a_ind: left hand row_ind array
+ * @param a_indptr: left hand index_ptr array
+ * @param a_val: left hand data array
+ * @param nnz1: size of left hand index_ptr and val arrays
+ * @param b_ind: right hand row_ind
array
+ * @param b_indptr: right hand index_ptr array
+ * @param b_val: right hand data array
+ * @param nnz2: size of right hand index_ptr and val arrays
+ * @param m: number of rows in the final matrix
+ * @param c_ind: row_ind array of the result; the kernel reads it to find
+ *               where each output row starts
+ * @param c_indptr: output ind_ptr array
+ * @param c_val: output data array
+ * @param stream: cuda stream to use
+ */
+template
+void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val,
+                      int nnz1, const int *b_ind, const int *b_indptr,
+                      const T *b_val, int nnz2, int m, int *c_ind,
+                      int *c_indptr, T *c_val, cudaStream_t stream) {
+  // One thread per row, TPB_X threads per block.
+  dim3 blk(TPB_X, 1, 1);
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+
+  csr_add_kernel
+    <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+         b_val, nnz2, m, c_ind, c_indptr, c_val);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
new file mode 100644
index 0000000000..081fbbe841
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @brief Count all the rows in the coo row array and place them in the
+ * results matrix, indexed by row.
+ *
+ * Counts are added atomically to `results`, so it must be zeroed by the
+ * caller beforehand.
+ *
+ * @tparam TPB_X: number of threads to use per block
+ * @param rows the rows array of the coo matrix
+ * @param nnz the size of the rows array
+ * @param results array to place results
+ */
+template
+__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) {
+  // One thread per COO entry (the local is an entry index, not a row).
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row < nnz) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
+}
+
+/**
+ * @brief Count the number of values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @param rows: rows array of the COO matrix
+ * @param nnz: size of the rows array
+ * @param results: output result array
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+
+  coo_degree_kernel<<>>(rows, nnz, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count the number of values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: type name of underlying values array
+ * @param in: input COO object for counting rows
+ * @param results: output array with row counts (size=in->n_rows)
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree(COO *in, int *results, cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+
+  coo_degree_kernel
+    <<>>(in->rows(), in->nnz, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Per-row count of the entries whose value is not 0.0. One thread
+ * per entry; counts are added atomically to `results`.
+ *
+ * @param rows row index of each entry
+ * @param vals value of each entry
+ * @param nnz number of entries
+ * @param results per-row counters
+ */
+template
+__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz,
+                                     int *results) {
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row < nnz && vals[row] != 0.0) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
+}
+
+/**
+ * @brief Per-row count of the entries whose value differs from `scalar`.
+ * One thread per entry; counts are added atomically to `results`.
+ *
+ * @param rows row index of each entry
+ * @param vals value of each entry
+ * @param nnz number of entries
+ * @param scalar value to exclude from the count
+ * @param results per-row counters
+ */
+template
+__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
+                                         int nnz, T scalar, int *results) {
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row < nnz && vals[row] != scalar) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
+}
+
+/**
+ * @brief Count the number of values for each row that are not equal to
+ * a particular scalar
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param in: Input COO array
+ * @param scalar: values equal to this scalar are not counted
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree_scalar(COO *in, T scalar, int *results,
+                       cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+  coo_degree_scalar_kernel<<>>(
+    in->rows(), in->vals(), in->nnz, scalar, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count the number of values for each row that are not equal to
+ * a particular scalar
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param rows: Input COO row array
+ * @param vals: Input COO val arrays
+ * @param nnz: size of input COO arrays
+ * @param scalar: values equal to this scalar are not counted
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
+                       int *results, cudaStream_t stream = 0) {
+  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+  coo_degree_scalar_kernel
+    <<>>(rows, vals, nnz, scalar, results);
+  // Same launch check as the COO-object overload above.
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count the number of nonzeros for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param rows: Input COO row array
+ * @param vals: Input COO val arrays
+ * @param nnz: size of input COO
arrays
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
+                   cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+  coo_degree_nz_kernel
+    <<>>(rows, vals, nnz, results);
+  // Report launch failures like the other wrappers in this file do.
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count the number of nonzero values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param in: Input COO array
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template
+void coo_degree_nz(COO *in, int *results, cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+
+  coo_degree_nz_kernel
+    <<>>(in->rows(), in->vals(), in->nnz, results);
+  // Report launch failures like the other wrappers in this file do.
+  CUDA_CHECK(cudaGetLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
new file mode 100644
index 0000000000..bfcd3fd592
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * Divides every value of a CSR row by the sum of the absolute values of
+ * that row. Rows whose sum is 0 are written as 0. One thread per row.
+ *
+ * @param ia     row start offsets (the last row ends at nnz)
+ * @param vals   input values
+ * @param nnz    number of values
+ * @param m      number of rows
+ * @param result output values, same layout as vals
+ */
+template
+__global__ void csr_row_normalize_l1_kernel(
+  // @TODO: This can be done much more parallel by
+  // having threads in a warp compute the sum in parallel
+  // over each row and then divide the values in parallel.
+  const int *ia,           // csr row ex_scan (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num rows in csr
+  T *result) {             // output array
+
+  // row-based matrix 1 thread per row
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row >= m) return;
+
+  // The offsets array has no trailing entry, so the last row ends at nnz.
+  int row_begin = ia[row];
+  int row_end = (row < m - 1) ? ia[row + 1] : nnz;
+
+  T sum = T(0.0);
+  for (int j = row_begin; j < row_end; j++) {
+    sum += fabs(vals[j]);
+  }
+
+  // The sum is fixed for the whole row, so test it once.
+  bool has_mass = (sum != 0.0);
+  for (int j = row_begin; j < row_end; j++) {
+    if (has_mass) {
+      result[j] = vals[j] / sum;
+    } else {
+      result[j] = 0.0;
+    }
+  }
+}
+
+/**
+ * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix
+ *
+ * @param ia: row_ind array
+ * @param vals: data array
+ * @param nnz: size of data array
+ * @param m: size of row_ind array
+ * @param result: l1 normalized data array
+ * @param stream: cuda stream to use
+ */
+template
+void csr_row_normalize_l1(const int *ia,  // csr row ex_scan (sorted by row)
+                          const T *vals,  // array of values
+                          int nnz,        // number of non-zeros
+                          int m,          // num rows in csr
+                          T *result,      // output array
+                          cudaStream_t stream) {
+  dim3 blk(TPB_X, 1, 1);
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+
+  csr_row_normalize_l1_kernel
+    <<>>(ia, vals, nnz, m, result);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * Divides every value of a CSR row by the largest value found in that row.
+ * One thread per row.
+ *
+ * NOTE(review): the running maximum starts at numeric_limits::min(), which
+ * for floating-point types is the smallest positive normal value rather
+ * than the most negative one. A row with no value above that threshold is
+ * therefore written as all zeros. Confirm whether that is intended.
+ *
+ * @param ia     row start offsets (the last row ends at nnz)
+ * @param vals   input values
+ * @param nnz    number of values
+ * @param m      number of rows
+ * @param result output values, same layout as vals
+ */
+template
+__global__ void csr_row_normalize_max_kernel(
+  // @TODO: This can be done much more parallel by
+  // having threads in a warp compute the sum in parallel
+  // over each row and then divide the values in parallel.
+  const int *ia,           // csr row ind array (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num total rows in csr
+  T *result) {             // output array
+
+  // row-based matrix 1 thread per row
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row >= m) return;
+
+  // The offsets array has no trailing entry, so the last row ends at nnz.
+  int row_begin = ia[row];
+  int row_end = (row < m - 1) ? ia[row + 1] : nnz;
+
+  // find max across columns
+  T row_max = std::numeric_limits::min();
+  for (int j = row_begin; j < row_end; j++) {
+    if (vals[j] > row_max) row_max = vals[j];
+  }
+
+  // divide nonzeros in current row by max; the test does not depend on j
+  bool usable = (row_max != 0.0 && row_max > std::numeric_limits::min());
+  for (int j = row_begin; j < row_end; j++) {
+    if (usable) {
+      result[j] = vals[j] / row_max;
+    } else {
+      result[j] = 0.0;
+    }
+  }
+}
+
+/**
+ * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix
+ *
+ * @param ia: row_ind array
+ * @param vals: data array
+ * @param nnz: size of data array
+ * @param m: size of row_ind array
+ * @param result: max-normalized data array
+ * @param stream: cuda stream to use
+ */
+
+template
+void csr_row_normalize_max(const int *ia,  // csr row ind array (sorted by row)
+                           const T *vals,  // array of values
+                           int nnz,        // number of non-zeros
+                           int m,          // num total rows in csr
+                           T *result, cudaStream_t stream) {
+  dim3 blk(TPB_X, 1, 1);
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+
+  csr_row_normalize_max_kernel
+    <<>>(ia, vals, nnz, m, result);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
new file mode 100644
index 0000000000..43638471ad
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -0,0 +1,103 @@
+/*
+ *
Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace spectral {
+
+/**
+ * Computes a spectral embedding of a graph given in COO form.
+ *
+ * The COO input is converted to CSR, n_components + 1 eigenpairs are
+ * requested from the Lanczos solver through raft::spectral::partition
+ * (with a clustering step that does nothing), and n * n_components
+ * eigenvector values, starting at offset n of the eigenvector buffer, are
+ * copied to `out`.
+ *
+ * @param handle       raft handle supplying the stream and device allocator
+ * @param rows         COO row indices
+ * @param cols         COO column indices
+ * @param vals         COO values
+ * @param nnz          number of COO entries
+ * @param n            number of vertices (rows of the matrix)
+ * @param n_components number of embedding components to produce
+ * @param out          output buffer receiving n * n_components values
+ */
+template
+void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
+                   int nnz, int n, int n_components, T *out) {
+  auto stream = handle.get_stream();
+  auto d_alloc = handle.get_device_allocator();
+  raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1);
+  raft::mr::device::buffer dst_cols(d_alloc, stream, nnz);
+  raft::mr::device::buffer dst_vals(d_alloc, stream, nnz);
+  convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
+                      dst_cols.data(), dst_vals.data());
+
+  // One eigenpair more than requested; the first eigenvector is skipped
+  // by the copy at the end.
+  raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1);
+  raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1));
+  raft::mr::device::buffer labels(d_alloc, stream, n);
+
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  /**
+   * Raft spectral clustering
+   */
+  using index_type = int;
+  using value_type = T;
+
+  index_type *ro = src_offsets.data();
+  index_type *ci = dst_cols.data();
+  value_type *vs = dst_vals.data();
+
+  raft::matrix::sparse_matrix_t const r_csr_m{
+    handle, ro, ci, vs, n, nnz};
+
+  index_type neigvs = n_components + 1;
+  index_type maxiter = 4000;  //default reset value (when set to 0);
+  value_type tol = 0.01;
+  index_type restart_iter = 15 + neigvs;  //what cugraph is using
+  auto t_exe_p = thrust::cuda::par.on(stream);
+  using thrust_exe_policy_t = decltype(t_exe_p);
+
+  raft::eigen_solver_config_t cfg{neigvs, maxiter,
+                                  restart_iter, tol};
+
+  raft::lanczos_solver_t eig_solver{cfg};
+
+  //cluster computation here is irrelevant,
+  //hence define a no-op such solver to
+  //feed partition():
+  //
+  struct no_op_cluster_solver_t {
+    using index_type_t = index_type;
+    using size_type_t = index_type;
+    using value_type_t = value_type;
+
+    std::pair solve(
+      handle_t const &handle, thrust_exe_policy_t t_exe_policy,
+      size_type_t n_obs_vecs, size_type_t dim,
+      value_type_t const *__restrict__ obs,
+      index_type_t *__restrict__ codes) const {
+      return std::make_pair(0, 0);
+    }
+  };
+
+  raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver,
+                            no_op_cluster_solver_t{}, labels.data(),
+                            eigVals.data(), eigVecs.data());
+
+  // Skip the first n values (the first eigenvector).
+  raft::copy(out, eigVecs.data() + n, n * n_components, stream);
+
+  CUDA_CHECK(cudaGetLastError());
+}
+};  // namespace spectral
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
new file mode 100644
index 0000000000..bb298008b7
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +namespace raft { +namespace sparse { +namespace linalg { + +// TODO: value_idx param needs to be used for this once FAISS is updated to use float32 +// for indices so that the index types can be uniform +/** + * @brief Kernel used by coo_symmetrize. Each thread handles one row: for every + * entry (cur_row, cur_col) of that row it scans the entries of row cur_col + * for the transposed entry, applies reduction_op to the value and its + * transpose (0.0 when none is found), and appends the results to the + * output arrays starting at offset 2 * row_ind[row]. + */ +template +__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, + T *vals, int *orows, int *ocols, T *ovals, + int n, int cnnz, Lambda reduction_op) { + int row = (blockIdx.x * TPB_X) + threadIdx.x; + + if (row < n) { + int start_idx = row_ind[row]; // each thread processes one row + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + + int row_nnz = 0; + int out_start_idx = start_idx * 2; + + for (int idx = 0; idx < stop_idx - start_idx; idx++) { + int cur_row = rows[idx + start_idx]; + int cur_col = cols[idx + start_idx]; + T cur_val = vals[idx + start_idx]; + + int lookup_row = cur_col; + int t_start = row_ind[lookup_row]; // Start of the row that may hold the transposed entry + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + + T transpose = 0.0; + + bool found_match = false; + for (int t_idx = t_start; t_idx < t_stop; t_idx++) { + // If we find a match, let's get out of the loop. We won't + // need to modify the transposed value, since that will be + // done in a different thread. + if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { + // If it exists already, set transposed value to existing value + transpose = vals[t_idx]; + found_match = true; + break; + } + } + + // Custom reduction op on value and its transpose, which enables + // specialized weighting. + // If only simple X+X.T is desired, this op can just sum + // the two values. + T res = reduction_op(cur_row, cur_col, cur_val, transpose); + + // if we didn't find an exact match, we need to add + // the computed res into our current matrix to guarantee + // symmetry.
+ // Note that if we did find a match, we don't need to + // compute `res` on it here because it will be computed + // in a different thread. + // NOTE(review): vals[idx] below is read without the start_idx offset used for + // cur_val; confirm whether cur_val was intended. + if (!found_match && vals[idx] != 0.0) { + orows[out_start_idx + row_nnz] = cur_col; + ocols[out_start_idx + row_nnz] = cur_row; + ovals[out_start_idx + row_nnz] = res; + ++row_nnz; + } + + if (res != 0.0) { + orows[out_start_idx + row_nnz] = cur_row; + ocols[out_start_idx + row_nnz] = cur_col; + ovals[out_start_idx + row_nnz] = res; + ++row_nnz; + } + } + } +} + +/** + * @brief takes a COO matrix which may not be symmetric and symmetrizes + * it, running a custom reduction function against each value + * and its transposed value. + * + * @param in: Input COO matrix + * @param out: Output symmetrized COO matrix (must be unallocated; it is allocated here with 2 * in->nnz entries) + * @param reduction_op: a custom reduction function + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void coo_symmetrize(COO *in, COO *out, + Lambda reduction_op, // invoked as reduction_op(row, col, value, transposed value) + std::shared_ptr d_alloc, + cudaStream_t stream) { + dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + + ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); + + raft::mr::device::buffer in_row_ind(d_alloc, stream, in->n_rows); + + convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + + out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); + + coo_symmetrize_kernel<<>>( + in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), + out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Find how much space needed in each row. + * We look through all datapoints and increment the count for each row.
+ * + * @param data: Input knn distances(n, k) (not read by this kernel) + * @param indices: Input knn indices(n, k) + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param row_sizes: Input empty row sum 1 array(n); incremented for odd j + * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction; incremented for even j + */ +template +__global__ static void symmetric_find_size(const value_t *restrict data, + const value_idx *restrict indices, + const value_idx n, const int k, + value_idx *restrict row_sizes, + value_idx *restrict row_sizes2) { + const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row + const auto j = + blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + if (row >= n || j >= k) return; + + const auto col = indices[row * k + j]; + if (j % 2) + atomicAdd(&row_sizes[col], value_idx(1)); + else + atomicAdd(&row_sizes2[col], value_idx(1)); +} + +/** + * @brief Reduce sum(row_sizes) + k + * Reduction for symmetric_find_size kernel. Allows algo to be faster. + * + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param row_sizes: Input row sum 1 array(n); overwritten with the combined sum + * @param row_sizes2: Input row sum 2 array(n) for faster reduction + */ +template +__global__ static void reduce_find_size(const value_idx n, const int k, + value_idx *restrict row_sizes, + const value_idx *restrict row_sizes2) { + const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; + if (i >= n) return; + row_sizes[i] += (row_sizes2[i] + k); +} + +/** + * @brief Perform data + data.T operation. + * Can only run once row_sizes from the CSR matrix of data + data.T has been + * determined.
+ * + * @param edges: Per-row output cursors(n); each is atomically incremented to claim an output slot + * @param data: Input knn distances(n, k) + * @param indices: Input knn indices(n, k) + * @param VAL: Output values for data + data.T + * @param COL: Output column indices for data + data.T + * @param ROW: Output row indices for data + data.T + * @param n: Number of rows + * @param k: Number of n_neighbors + */ +template +__global__ static void symmetric_sum(value_idx *restrict edges, + const value_t *restrict data, + const value_idx *restrict indices, + value_t *restrict VAL, + value_idx *restrict COL, + value_idx *restrict ROW, const value_idx n, + const int k) { + const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row + const auto j = + blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + if (row >= n || j >= k) return; + + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto transpose = atomicAdd(&edges[col], value_idx(1)); + + VAL[transpose] = VAL[original] = data[row * k + j]; + // Notice swapped ROW, COL since transpose + ROW[original] = row; + COL[original] = col; + + ROW[transpose] = col; + COL[transpose] = row; +} + +/** + * @brief Perform data + data.T on raw KNN data.
+ * The following steps are invoked: + * (1) Find how much space needed in each row + * (2) Compute final space needed (n*k + sum(row_sizes)) == 2*n*k + * (3) Allocate new space + * (4) Prepare edges for each new row + * (5) Perform final data + data.T operation + * (6) Return summed up VAL, COL, ROW + * + * @param knn_indices: Input knn indices(n, k) + * @param knn_dists: Input knn distances(n, k) + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param out: Output COO Matrix class + * @param stream: Input cuda stream + * @param d_alloc device allocator for temporary buffers + */ +template +void from_knn_symmetrize_matrix( + const value_idx *restrict knn_indices, const value_t *restrict knn_dists, + const value_idx n, const int k, COO *out, + cudaStream_t stream, std::shared_ptr d_alloc) { + // (1) Find how much space needed in each row + // We look through all datapoints and increment the count for each row. + const dim3 threadsPerBlock(TPB_X, TPB_Y); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), + raft::ceildiv(k, TPB_Y)); + + // row_sizes2 is reused for the edges array in step (4); both buffers hold n elements + raft::mr::device::buffer row_sizes(d_alloc, stream, n); + CUDA_CHECK( + cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + + raft::mr::device::buffer row_sizes2(d_alloc, stream, n); + CUDA_CHECK( + cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + + symmetric_find_size<<>>( + knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); + CUDA_CHECK(cudaPeekAtLastError()); + + reduce_find_size<<>>( + n, k, row_sizes.data(), row_sizes2.data()); + CUDA_CHECK(cudaPeekAtLastError()); + + // (2) Compute final space needed (n*k + sum(row_sizes)) == 2*n*k + // Notice we don't do any merging and leave the result as 2*NNZ + const auto NNZ = 2 * n * k; + + // (3) Allocate new space + out->allocate(NNZ, n, n, true, stream); + + // (4) Prepare edges for each new row + // This
mirrors CSR matrix's row Pointer, where maximum bounds for each row + // are calculated as the cumulative rolling sum of the previous rows. + // Notice reusing old row_sizes2 memory + value_idx *edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = + thrust::device_pointer_cast(row_sizes.data()); + + // Rolling cumulative sum + thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, + __row_sizes + n, __edges); + + // (5) Perform final data + data.T operation in tandem with memcpying + symmetric_sum<<>>( + edges, knn_dists, knn_indices, out->vals(), out->cols(), out->rows(), n, k); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h new file mode 100644 index 0000000000..6afe4ca8f6 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * Transpose a set of CSR arrays into a set of CSC arrays.
+ * @tparam value_idx : data type of the CSR index arrays + * @tparam value_t : data type of the CSR data array + * @param[in] handle : used for invoking cusparse + * @param[in] csr_indptr : CSR row index array + * @param[in] csr_indices : CSR column indices array + * @param[in] csr_data : CSR data array + * @param[out] csc_indptr : CSC column pointer array + * @param[out] csc_indices : CSC row indices array + * @param[out] csc_data : CSC data array + * @param[in] csr_nrows : Number of rows in CSR + * @param[in] csr_ncols : Number of columns in CSR + * @param[in] nnz : Number of nonzeros of CSR + * @param[in] allocator : Allocator for intermediate memory + * @param[in] stream : Cuda stream for ordering events + */ +template +void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, + const value_idx *csr_indices, const value_t *csr_data, + value_idx *csc_indptr, value_idx *csc_indices, + value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, + value_idx nnz, + std::shared_ptr allocator, + cudaStream_t stream) { + size_t convert_csc_workspace_size = 0; + + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( + handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, + csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, stream)); + + raft::mr::device::buffer convert_csc_workspace( + allocator, stream, convert_csc_workspace_size); + + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( + handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, + csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), stream)); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh new file mode 100644 index
0000000000..53359be57c --- /dev/null +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * @brief Kernel that copies the entries of each row whose value is not 0.0 into + * the compressed output arrays. One thread per row: entries are read from + * cur_ex_scan[row] up to get_stop_idx(row, m, nnz, cur_ex_scan) and written + * starting at ex_scan[row]. + */ +template +__global__ void coo_remove_zeros_kernel(const int *rows, const int *cols, + const T *vals, int nnz, int *crows, + int *ccols, T *cvals, int *ex_scan, + int *cur_ex_scan, int m) { + int row = (blockIdx.x * TPB_X) + threadIdx.x; + + if (row < m) { + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int cur_out_idx = ex_scan[row]; + + for (int idx = start; idx < stop; idx++) { + if (vals[idx] != 0.0) { + crows[cur_out_idx] = rows[idx]; + ccols[cur_out_idx] = cols[idx]; + cvals[cur_out_idx] = vals[idx]; + ++cur_out_idx; + } + } + } +} + +/** + * @brief Same traversal as coo_remove_zeros_kernel, but drops the entries whose + * value equals scalar instead of 0.0. + */ +template +__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, + const T *vals, int nnz, int *crows, + int *ccols, T *cvals, int *ex_scan, + int *cur_ex_scan, int m, T scalar) { + int row = (blockIdx.x * TPB_X) + threadIdx.x; + + if (row < m) { + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int cur_out_idx = ex_scan[row]; + + for (int idx = start; idx < stop; idx++) { + if (vals[idx] !=
scalar) { + crows[cur_out_idx] = rows[idx]; + ccols[cur_out_idx] = cols[idx]; + cvals[cur_out_idx] = vals[idx]; + ++cur_out_idx; + } + } + } +} + +/** + * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. + * + * @param rows: input array of rows (size nnz) + * @param cols: input array of cols (size nnz) + * @param vals: input array of vals (size nnz) + * @param nnz: size of current rows/cols/vals arrays + * @param crows: compressed array of rows + * @param ccols: compressed array of cols + * @param cvals: compressed array of vals + * @param cnnz: array of non-zero counts per row + * @param cur_cnnz array of counts per row + * @param scalar: scalar to remove from arrays + * @param n: number of rows in dense matrix + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, + int *crows, int *ccols, T *cvals, int *cnnz, + int *cur_cnnz, T scalar, int n, + std::shared_ptr d_alloc, + cudaStream_t stream) { + raft::mr::device::buffer ex_scan(d_alloc, stream, n); + raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); + + CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); + + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = + thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + dev_ex_scan); + CUDA_CHECK(cudaPeekAtLastError()); + + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = + thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + dev_cur_cnnz + n, dev_cur_ex_scan); + CUDA_CHECK(cudaPeekAtLastError()); + + dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); +
dim3 blk(TPB_X, 1, 1); + + coo_remove_scalar_kernel<<>>( + rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), + dev_cur_ex_scan.get(), n, scalar); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. + * + * @param in: input COO matrix + * @param out: output COO matrix (allocated here with the number of remaining entries) + * @param scalar: scalar to remove from arrays + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void coo_remove_scalar(COO *in, COO *out, T scalar, + std::shared_ptr d_alloc, + cudaStream_t stream) { + raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); + raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); + + CUDA_CHECK( + cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + + linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); + CUDA_CHECK(cudaPeekAtLastError()); + + linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, + row_count_nz.data(), stream); + CUDA_CHECK(cudaPeekAtLastError()); + + thrust::device_ptr d_row_count_nz = + thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + d_row_count_nz + in->n_rows); + + out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); + + coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, + out->rows(), out->cols(), out->vals(), + row_count_nz.data(), row_count.data(), scalar, + in->n_rows, d_alloc, stream); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Removes zeros from a COO formatted sparse matrix.
+ * + * @param in: input COO matrix + * @param out: output COO matrix + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void coo_remove_zeros(COO *in, COO *out, + std::shared_ptr d_alloc, + cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +} + +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh new file mode 100644 index 0000000000..9e5034dc28 --- /dev/null +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * @brief Kernel that calls op(row, start_idx, stop_idx) once per row, where + * start_idx is row_ind[row] and stop_idx is the next entry of row_ind, or + * nnz for the last row. + */ +template void> +__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, + Lambda op) { + T row = blockIdx.x * TPB_X + threadIdx.x; + if (row < n_rows) { + T start_idx = row_ind[row]; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + op(row, start_idx, stop_idx); + } +} + +/** + * @brief Perform a custom row operation on a CSR matrix in batches.
+ * @tparam Index_ numerical type of row_ind array + * @tparam TPB_X number of threads per block to use for underlying kernel + * @tparam Lambda type of custom operation function + * @param row_ind the CSR row_ind array to perform parallel operations over + * @param n_rows total number vertices in graph + * @param nnz number of non-zeros + * @param op custom row operation functor accepting the row, start index and stop index. + * @param stream cuda stream to use + */ +template void> +void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, + cudaStream_t stream) { + dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); + dim3 blk(TPB_X, 1, 1); + csr_row_op_kernel + <<>>(row_ind, n_rows, nnz, op); + + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h new file mode 100644 index 0000000000..46f4f41879 --- /dev/null +++ b/cpp/include/raft/sparse/op/slice.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * Slice consecutive rows from a CSR array and populate newly sliced indptr array + * @tparam value_idx + * @param[in] start_row : beginning row to slice + * @param[in] stop_row : ending row to slice + * @param[in] indptr : indptr of input CSR to slice + * @param[out] indptr_out : output sliced indptr to populate + * @param[out] start_offset : receives indptr[start_row] (written via raft::update_host) + * @param[out] stop_offset : receives the indptr entry following indptr[stop_row] (written via raft::update_host) + * @param[in] stream : cuda stream for ordering events + */ +template +void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, + const value_idx *indptr, value_idx *indptr_out, + value_idx *start_offset, value_idx *stop_offset, + cudaStream_t stream) { + raft::update_host(start_offset, indptr + start_row, 1, stream); + raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + value_idx s_offset = *start_offset; + + // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, + // we add another 1 to stop row.
+ raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, + stream); + + raft::linalg::unaryOp( + indptr_out, indptr_out, (stop_row + 2) - start_row, + [s_offset] __device__(value_idx input) { return input - s_offset; }, + stream); +} + +/** + * Slice rows from a CSR, populate column and data arrays + * @tparam value_idx : data type of CSR index arrays + * @tparam value_t : data type of CSR data array + * @param[in] start_offset : beginning column offset to slice + * @param[in] stop_offset : ending column offset to slice + * @param[in] indices : column indices array from input CSR + * @param[in] data : data array from input CSR + * @param[out] indices_out : output column indices array + * @param[out] data_out : output data array + * @param[in] stream : cuda stream for ordering events + */ +template +void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, + const value_idx *indices, const value_t *data, + value_idx *indices_out, value_t *data_out, + cudaStream_t stream) { + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, + stream); + raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); +} + +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h new file mode 100644 index 0000000000..b039e52517 --- /dev/null +++ b/cpp/include/raft/sparse/op/sort.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * @brief Sorts the arrays that comprise the coo matrix + * by row. The rows and cols arrays are sorted by cusparseXcoosortByRow; + * vals is gathered through the resulting permutation and copied back. + * A cusparse handle is created and destroyed inside each call. + * + * @param m number of rows in coo matrix + * @param n number of cols in coo matrix + * @param nnz number of non-zeros + * @param rows rows array from coo matrix + * @param cols cols array from coo matrix + * @param vals vals array from coo matrix + * @param d_alloc device allocator for temporary buffers + * @param stream: cuda stream to use + */ +template +void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, + std::shared_ptr d_alloc, + cudaStream_t stream) { + cusparseHandle_t handle = NULL; + + size_t pBufferSizeInBytes = 0; + + CUSPARSE_CHECK(cusparseCreate(&handle)); + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, rows, cols, + &pBufferSizeInBytes)); + + raft::mr::device::buffer d_P(d_alloc, stream, nnz); + raft::mr::device::buffer pBuffer(d_alloc, stream, pBufferSizeInBytes); + + CUSPARSE_CHECK(cusparseCreateIdentityPermutation(handle, nnz, d_P.data())); + + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, rows, cols, + d_P.data(), pBuffer.data())); + + raft::mr::device::buffer vals_sorted(d_alloc, stream, nnz); + + CUSPARSE_CHECK(raft::sparse::cusparsegthr( + handle, nnz, vals, vals_sorted.data(), d_P.data(), stream)); + +
CUDA_CHECK(cudaStreamSynchronize(stream)); + + raft::copy(vals, vals_sorted.data(), nnz, stream); + + CUSPARSE_CHECK(cusparseDestroy(handle)); +} + +/** + * @brief Sort the underlying COO arrays by row + * @tparam T: the type name of the underlying value array + * @param in: COO to sort by row + * @param d_alloc device allocator for temporary buffers + * @param stream: the cuda stream to use + */ +template +void coo_sort(COO *const in, + std::shared_ptr d_alloc, + cudaStream_t stream) { + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), + in->vals(), d_alloc, stream); +} +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh new file mode 100644 index 0000000000..3e8fa2bd6f --- /dev/null +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +namespace raft { +namespace sparse { +namespace selection { + +/** + * @brief Helper that walks a CSR matrix in batches of at most batch_size rows. + * set_batch selects the row range of a batch; the get_batch_* methods + * slice the indptr, indices and data arrays for that range. + */ +template +struct csr_batcher_t { + csr_batcher_t(value_idx batch_size, value_idx n_rows, + const value_idx *csr_indptr, const value_idx *csr_indices, + const value_t *csr_data) + : batch_start_(0), + batch_stop_(0), + batch_rows_(0), + total_rows_(n_rows), + batch_size_(batch_size), + csr_indptr_(csr_indptr), + csr_indices_(csr_indices), + csr_data_(csr_data), + batch_csr_start_offset_(0), + batch_csr_stop_offset_(0) {} + + void set_batch(int batch_num) { + batch_start_ = batch_num * batch_size_; + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + + if (batch_stop_ >= total_rows_) + batch_stop_ = total_rows_ - 1; // zero-based indexing + + batch_rows_ = (batch_stop_ - batch_start_) + 1; + } + + value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, + cudaStream_t stream) { + raft::sparse::op::csr_row_slice_indptr( + batch_start_, batch_stop_, csr_indptr_, batch_indptr, + &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); + + return batch_csr_stop_offset_ - batch_csr_start_offset_; + } + + void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, + cudaStream_t stream) { + raft::sparse::op::csr_row_slice_populate( + batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, + csr_indices, csr_data, stream); + } + + value_idx batch_rows() const { return batch_rows_; } + + value_idx batch_start() const { return batch_start_; } + + value_idx batch_stop() const { return batch_stop_; } + + private: + value_idx batch_size_; + value_idx batch_start_; + value_idx batch_stop_; + value_idx batch_rows_; + + value_idx total_rows_; + + const value_idx *csr_indptr_; + const value_idx *csr_indices_; + const
value_t *csr_data_; + + value_idx batch_csr_start_offset_; + value_idx batch_csr_stop_offset_; +}; + +template +class sparse_knn_t { + public: + sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, + const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, + int n_idx_cols_, const value_idx *queryIndptr_, + const value_idx *queryIndices_, const value_t *queryData_, + size_t queryNNZ_, int n_query_rows_, int n_query_cols_, + value_idx *output_indices_, value_t *output_dists_, int k_, + cusparseHandle_t cusparseHandle_, + std::shared_ptr allocator_, + cudaStream_t stream_, + size_t batch_size_index_ = 2 << 14, // 2 << 14 == 32768 + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = + raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0, bool expanded_form_ = false) + : idxIndptr(idxIndptr_), + idxIndices(idxIndices_), + idxData(idxData_), + idxNNZ(idxNNZ_), + n_idx_rows(n_idx_rows_), + n_idx_cols(n_idx_cols_), + queryIndptr(queryIndptr_), + queryIndices(queryIndices_), + queryData(queryData_), + queryNNZ(queryNNZ_), + n_query_rows(n_query_rows_), + n_query_cols(n_query_cols_), + output_indices(output_indices_), + output_dists(output_dists_), + k(k_), + cusparseHandle(cusparseHandle_), + allocator(allocator_), + stream(stream_), + batch_size_index(batch_size_index_), + batch_size_query(batch_size_query_), + metric(metric_), + metricArg(metricArg_), + expanded_form(expanded_form_) {} + + void run() { + using namespace raft::sparse; + + int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); + csr_batcher_t query_batcher( + batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); + + size_t rows_processed = 0; + + for (int i = 0; i < n_batches_query; i++) { + /** + * Compute index batch info + */ + query_batcher.set_batch(i); + + /** + * Slice CSR to rows in batch + */ + + raft::mr::device::buffer query_batch_indptr( + allocator, stream, query_batcher.batch_rows() + 1); + + value_idx
n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), stream); + + raft::mr::device::buffer query_batch_indices( + allocator, stream, n_query_batch_nnz); + raft::mr::device::buffer query_batch_data(allocator, stream, + n_query_batch_nnz); + + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), stream); + + // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent + // batches and 1 space for the results of the merge, which get copied back to the top + raft::mr::device::buffer merge_buffer_indices(allocator, + stream, 0); + raft::mr::device::buffer merge_buffer_dists(allocator, stream, + 0); + + value_t *dists_merge_buffer_ptr; + value_idx *indices_merge_buffer_ptr; + + int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); + csr_batcher_t idx_batcher( + batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); + + for (int j = 0; j < n_batches_idx; j++) { + idx_batcher.set_batch(j); + + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, stream); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, stream); + + /** + * Slice CSR to rows in batch + */ + raft::mr::device::buffer idx_batch_indptr( + allocator, stream, idx_batcher.batch_rows() + 1); + raft::mr::device::buffer idx_batch_indices(allocator, stream, + 0); + raft::mr::device::buffer idx_batch_data(allocator, stream, 0); + + value_idx idx_batch_nnz = + idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), stream); + + idx_batch_indices.resize(idx_batch_nnz, stream); + idx_batch_data.resize(idx_batch_nnz, stream); + + idx_batcher.get_batch_csr_indices_data(idx_batch_indices.data(), + idx_batch_data.data(), stream); + + /** + * Compute distances + */ + size_t dense_size = + idx_batcher.batch_rows() * query_batcher.batch_rows(); + raft::mr::device::buffer batch_dists(allocator, stream, + dense_size); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, + 
batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, query_batcher, idx_batch_nnz, + n_query_batch_nnz, idx_batch_indptr.data(), + idx_batch_indices.data(), idx_batch_data.data(), + query_batch_indptr.data(), query_batch_indices.data(), + query_batch_data.data(), batch_dists.data()); + + idx_batch_indptr.release(stream); + idx_batch_indices.release(stream); + idx_batch_data.release(stream); + + // Build batch indices array + raft::mr::device::buffer batch_indices(allocator, stream, + batch_dists.size()); + + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), + batch_cols = idx_batcher.batch_rows(); + + iota_fill(batch_indices.data(), batch_rows, batch_cols, stream); + + /** + * Perform k-selection on batch & merge with other k-selections + */ + size_t merge_buffer_offset = batch_rows * k; + dists_merge_buffer_ptr = + merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = + merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), + batch_indices.data(), dists_merge_buffer_ptr, + indices_merge_buffer_ptr); + + perform_postprocessing(dists_merge_buffer_ptr, batch_rows); + + value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + + // Merge results of difference batches if necessary + if (idx_batcher.batch_start() > 0) { + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = + merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = + merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), + merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, + indices_merge_buffer_tmp_ptr); + } + + // copy merged output back into merge buffer partition for next iteration + raft::copy_async(merge_buffer_indices.data(), + 
indices_merge_buffer_tmp_ptr, + batch_rows * k, stream); + raft::copy_async(merge_buffer_dists.data(), + dists_merge_buffer_tmp_ptr, batch_rows * k, + stream); + } + + // Copy final merged batch to output array + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, stream); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, stream); + + rows_processed += query_batcher.batch_rows(); + } + } + + void perform_postprocessing(value_t *dists, size_t batch_rows) { + // Perform necessary post-processing + if (metric == raft::distance::DistanceType::L2Expanded && !expanded_form) { + /** + * post-processing + */ + value_t p = 0.5; // standard l2 + raft::linalg::unaryOp( + dists, dists, batch_rows * k, + [p] __device__(value_t input) { + int neg = input < 0 ? -1 : 1; + return powf(fabs(input), p) * neg; + }, + stream); + } + } + + private: + void merge_batches(csr_batcher_t &idx_batcher, + csr_batcher_t &query_batcher, + value_t *merge_buffer_dists, + value_idx *merge_buffer_indices, value_t *out_dists, + value_idx *out_indices) { + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + raft::mr::device::buffer trans(allocator, stream, + id_ranges.size()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), + stream); + + // combine merge buffers only if there's more than 1 partition to combine + raft::spatial::knn::detail::knn_merge_parts( + merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, + query_batcher.batch_rows(), 2, k, stream, trans.data()); + } + + void perform_k_selection(csr_batcher_t idx_batcher, + csr_batcher_t query_batcher, + value_t *batch_dists, value_idx *batch_indices, + value_t *out_dists, value_idx *out_indices) { + // populate batch indices array + value_idx batch_rows 
= query_batcher.batch_rows(), + batch_cols = idx_batcher.batch_rows(); + + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + // in the case where the number of idx rows in the batch is < k, we + // want to adjust k. + value_idx n_neighbors = min(k, batch_cols); + + bool ascending = true; + if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; + + // kernel to slice first (min) k cols and copy into batched merge buffer + select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, + out_indices, ascending, n_neighbors, stream); + } + + void compute_distances(csr_batcher_t &idx_batcher, + csr_batcher_t &query_batcher, + size_t idx_batch_nnz, size_t query_batch_nnz, + value_idx *idx_batch_indptr, + value_idx *idx_batch_indices, value_t *idx_batch_data, + value_idx *query_batch_indptr, + value_idx *query_batch_indices, + value_t *query_batch_data, value_t *batch_dists) { + /** + * Compute distances + */ + raft::sparse::distance::distances_config_t dist_config; + dist_config.b_nrows = idx_batcher.batch_rows(); + dist_config.b_ncols = n_idx_cols; + dist_config.b_nnz = idx_batch_nnz; + + dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indices = idx_batch_indices; + dist_config.b_data = idx_batch_data; + + dist_config.a_nrows = query_batcher.batch_rows(); + dist_config.a_ncols = n_query_cols; + dist_config.a_nnz = query_batch_nnz; + + dist_config.a_indptr = query_batch_indptr; + dist_config.a_indices = query_batch_indices; + dist_config.a_data = query_batch_data; + + dist_config.handle = cusparseHandle; + dist_config.allocator = allocator; + dist_config.stream = stream; + + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, + metricArg); + } + + const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; + value_idx *output_indices; + const value_t *idxData, *queryData; + 
value_t *output_dists; + + size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; + + raft::distance::DistanceType metric; + + float metricArg; + + bool expanded_form; + + int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; + + cusparseHandle_t cusparseHandle; + + std::shared_ptr allocator; + + cudaStream_t stream; +}; + +/** + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size idxNNZ) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNZ number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] cusparseHandle the initialized cusparseHandle instance to use + * @param[in] allocator device allocator instance to use + * @param[in] stream CUDA stream to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric
(currently unused) + * @param[in] expanded_form whether or not Lp variants should be reduced by the pth-root + */ +template +void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, + const value_t *idxData, size_t idxNNZ, int n_idx_rows, + int n_idx_cols, const value_idx *queryIndptr, + const value_idx *queryIndices, const value_t *queryData, + size_t queryNNZ, int n_query_rows, int n_query_cols, + value_idx *output_indices, value_t *output_dists, int k, + cusparseHandle_t cusparseHandle, + std::shared_ptr allocator, + cudaStream_t stream, + size_t batch_size_index = 2 << 14, // 2 << 14 == 32768 rows per batch + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2Expanded, + float metricArg = 0, bool expanded_form = false) { + sparse_knn_t( + idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, + queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, + output_indices, output_dists, k, cusparseHandle, allocator, stream, + batch_size_index, batch_size_query, metric, metricArg, expanded_form) + .run(); +} + +}; // namespace selection +}; // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh new file mode 100644 index 0000000000..6066a36289 --- /dev/null +++ b/cpp/include/raft/sparse/selection/selection.cuh @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace selection { + +template +__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, + size_t n_cols, K *outK, IndexType *outV, + K initK, IndexType initV, int k) { + constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + + __shared__ K smemK[kNumWarps * warp_q]; + __shared__ IndexType smemV[kNumWarps * warp_q]; + + faiss::gpu::BlockSelect, + warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + int i = threadIdx.x; + + int idx = row * n_cols; + K *inKStart = inK + idx + i; + IndexType *inVStart = inV + idx + i; + + // Whole warps must participate in the selection + int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); + + for (; i < limit; i += tpb) { + inKStart = inK + idx + i; + inVStart = inV + idx + i; + + heap.add(*inKStart, *inVStart); + } + + // Handle last remainder fraction of a warp of elements + if (i < n_cols) { + inKStart = inK + idx + i; + inVStart = inV + idx + i; + heap.addThreadQ(*inKStart, *inVStart); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += tpb) { + outK[row * k + i] = smemK[i]; + outV[row * k + i] = smemV[i]; + } +} + +template +inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, + size_t n_cols, value_t *outK, value_idx *outV, + bool select_min, int k, cudaStream_t stream) { + auto grid = dim3(n_rows); + + constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; + auto block = dim3(n_threads); + + auto kInit = select_min ? 
faiss::gpu::Limits::getMax() + : faiss::gpu::Limits::getMin(); + auto vInit = -1; + if (select_min) { + select_k_kernel + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, + vInit, k); + } else { + select_k_kernel + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, + vInit, k); + } + CUDA_CHECK(cudaGetLastError()); +} + +/** + * @brief Select the k-nearest neighbors from dense + * distance and index matrices. + * + * @param[in] inK partitioned knn distance matrix + * @param[in] inV partitioned knn index matrix + * @param[in] n_rows number of rows in distance and index matrices + * @param[in] n_cols number of columns in distance and index matrices + * @param[out] outK merged knn distance matrix + * @param[out] outV merged knn index matrix + * @param[in] select_min whether to select the min or the max distances + * @param[in] k number of neighbors per partition (also number of merged neighbors) + * @param[in] stream CUDA stream to use + */ +template +inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, + value_t *outK, value_idx *outV, bool select_min, int k, + cudaStream_t stream) { + if (k == 1) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 32) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 64) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 128) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 256) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 512) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); + else if (k <= 1024) + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); +} + +}; // namespace selection +}; // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h new file mode 
100644 index 0000000000..63578bf1f3 --- /dev/null +++ b/cpp/include/raft/sparse/utils.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft { +namespace sparse { + +/** + * Quantizes ncols to a valid blockdim, which is + * a multiple of 32. + * + * @param[in] ncols number of blocks to quantize + */ +template +inline int block_dim(value_idx ncols) { + int blockdim; + if (ncols <= 32) + blockdim = 32; + else if (ncols <= 64) + blockdim = 64; + else if (ncols <= 128) + blockdim = 128; + else if (ncols <= 256) + blockdim = 256; + else if (ncols <= 512) + blockdim = 512; + else + blockdim = 1024; + + return blockdim; +} + +// add similar semantics for __match_any_sync pre-volta (SM_70) +#if __CUDA_ARCH__ < 700 +/** + * Returns a warp-level mask with 1's for all the threads + * in the current warp that have the same key. 
+ * @tparam G + * @param key + * @return + */ +template +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, + G key) { + unsigned int mask = __ballot_sync(init_mask, true); + unsigned int peer_group = 0; + bool is_peer; + + do { + // fetch key of first unclaimed lane and compare with this key + is_peer = (key == __shfl_sync(mask, key, __ffs(mask) - 1)); + + // determine which lanes had a match + peer_group = __ballot_sync(mask, is_peer); + + // remove lanes with matching keys from the pool + mask = mask ^ peer_group; + + // quit if we had a match + } while (!is_peer); + + return peer_group; +} +#endif + +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { + return __ffs(peer_group) - 1; +} + +template +__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { + int row = blockIdx.x; + int tid = threadIdx.x; + + for (int i = tid; i < ncols; i += blockDim.x) { + indices[row * ncols + i] = i; + } +} + +template +void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, + cudaStream_t stream) { + int blockdim = block_dim(ncols); + + iota_fill_block_kernel<<>>(indices, ncols); +} + +template +__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { + int stop_idx = 0; + if (row < (m - 1)) + stop_idx = ind[row + 1]; + else + stop_idx = nnz; + + return stop_idx; +} + +}; // namespace sparse +}; // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index ccee635701..5b77239dac 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -50,7 +50,7 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). 
*/ -void brute_force_knn( +inline void brute_force_knn( raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, bool rowMajorQuery = false, diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu new file mode 100644 index 0000000000..713708d4cd --- /dev/null +++ b/cpp/test/sparse/add.cu @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include +#include "../test_utils.h" + +#include +#include + +namespace raft { +namespace sparse { + +template +struct CSRMatrixVal { + std::vector row_ind; + std::vector row_ind_ptr; + std::vector values; +}; + +template +struct CSRAddInputs { + CSRMatrixVal matrix_a; + CSRMatrixVal matrix_b; + CSRMatrixVal matrix_verify; +}; + +template +class CSRAddTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); + nnz_result = params.matrix_verify.row_ind_ptr.size(); + + cudaStreamCreate(&stream); + + raft::allocate(ind_a, n_rows); + raft::allocate(ind_ptr_a, nnz_a); + raft::allocate(values_a, nnz_a); + + raft::allocate(ind_b, n_rows); + raft::allocate(ind_ptr_b, nnz_b); + raft::allocate(values_b, nnz_b); + + raft::allocate(ind_verify, n_rows); + raft::allocate(ind_ptr_verify, nnz_result); + raft::allocate(values_verify, nnz_result); + + raft::allocate(ind_result, n_rows); + raft::allocate(ind_ptr_result, nnz_result); + raft::allocate(values_result, nnz_result); + } + + void Run() { + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + + raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, + stream); + raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream); + + raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, + stream); + raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream); + + raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, + stream); + raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), + 
nnz_result, stream); + raft::update_device(values_verify, params.matrix_verify.values.data(), + nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds( + ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, + n_rows, ind_result, alloc, stream); + + ASSERT_TRUE(nnz == nnz_result); + ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, + raft::Compare())); + + linalg::csr_add_finalize( + ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, + n_rows, ind_result, ind_ptr_result, values_result, stream); + + ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(values_verify, values_result, nnz, + raft::Compare())); + } + + void TearDown() override { + // Release every device buffer allocated in SetUp(). + CUDA_CHECK(cudaFree(ind_a)); + CUDA_CHECK(cudaFree(ind_b)); + CUDA_CHECK(cudaFree(ind_verify)); + CUDA_CHECK(cudaFree(ind_result)); + CUDA_CHECK(cudaFree(ind_ptr_a)); + CUDA_CHECK(cudaFree(ind_ptr_b)); + CUDA_CHECK(cudaFree(ind_ptr_verify)); + CUDA_CHECK(cudaFree(ind_ptr_result)); + CUDA_CHECK(cudaFree(values_a)); + CUDA_CHECK(cudaFree(values_b)); + CUDA_CHECK(cudaFree(values_verify)); + CUDA_CHECK(cudaFree(values_result)); + cudaStreamDestroy(stream); + } + + protected: + CSRAddInputs params; + cudaStream_t stream; + Index_ n_rows, nnz_a, nnz_b, nnz_result; + Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, + *ind_ptr_verify, *ind_ptr_result; + Type_f *values_a, *values_b, *values_verify, *values_result; +}; + +using CSRAddTestF = CSRAddTest; +TEST_P(CSRAddTestF, Result) { Run(); } + +using CSRAddTestD = CSRAddTest; +TEST_P(CSRAddTestD, Result) { Run(); } + +const std::vector> csradd_inputs_f = { + {{{0, 4, 8, 9}, + {1, 2, 3, 4, 1, 2, 3, 5, 0, 1}, + {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}}, + {{0, 4, 8, 9}, + {1, 2, 5, 4, 0, 2, 3, 5, 1, 0}, + {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}}, + {{0, 5, 10, 12}, + {1, 2, 3, 4, 5, 1, 2, 3, 5, 0, 0, 1, 1, 0}, + {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0}}}, +}; +const std::vector> csradd_inputs_d = { + {{{0, 4, 8, 9}, + {1, 2, 3, 4, 1, 2, 3, 5, 0, 1}, + {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}}, + {{0, 4, 8, 9}, + {1, 2, 5, 4, 0, 2, 3, 5, 1, 0}, + {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}}, + {{0, 5, 10, 12}, + {1, 2, 3, 4, 5, 1, 2, 3, 5, 0, 0, 1, 1, 0}, + {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, +}; + +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, + ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, + ::testing::ValuesIn(csradd_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu new file mode 100644 index 0000000000..ea69ecfc53 --- /dev/null +++ b/cpp/test/sparse/convert_coo.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +#include "../test_utils.h" + +#include +#include + +namespace raft { +namespace sparse { + +template +struct CSRtoCOOInputs { + std::vector ex_scan; + std::vector verify; +}; + +template +class CSRtoCOOTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + + cudaStreamCreate(&stream); + raft::allocate(ex_scan, params.ex_scan.size()); + raft::allocate(verify, params.verify.size()); + raft::allocate(result, params.verify.size(), true); + } + + void Run() { + Index_ n_rows = params.ex_scan.size(); + Index_ nnz = params.verify.size(); + + raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); + raft::update_device(verify, params.verify.data(), nnz, stream); + + convert::csr_to_coo(ex_scan, n_rows, result, nnz, stream); + + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, + raft::Compare(), stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(ex_scan)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + protected: + CSRtoCOOInputs params; + cudaStream_t stream; + Index_ *ex_scan, *verify, *result; +}; + +using CSRtoCOOTestI = CSRtoCOOTest; +TEST_P(CSRtoCOOTestI, Result) { Run(); } + +using CSRtoCOOTestL = CSRtoCOOTest; +TEST_P(CSRtoCOOTestL, Result) { Run(); } + +const std::vector> csrtocoo_inputs_32 = { + {{0, 0, 2, 2}, {1, 1, 3}}, + {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, +}; +const std::vector> csrtocoo_inputs_64 = { + {{0, 0, 2, 2}, {1, 1, 3}}, + {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, +}; + +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, + ::testing::ValuesIn(csrtocoo_inputs_32)); +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, + ::testing::ValuesIn(csrtocoo_inputs_64)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu 
new file mode 100644 index 0000000000..553ef2ddee --- /dev/null +++ b/cpp/test/sparse/convert_csr.cu @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../test_utils.h" + +#include +#include +#include + +#include + +namespace raft { +namespace sparse { + +/**************************** sorted COO to CSR ****************************/ + +template +struct SparseConvertCSRInputs { + int m, n, nnz; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const SparseConvertCSRInputs &dims) { + return os; +} + +template +class SparseConvertCSRTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override {} + + void TearDown() override {} + + protected: + SparseConvertCSRInputs params; +}; + +const std::vector> inputsf = { + {5, 10, 5, 1234ULL}}; + +typedef SparseConvertCSRTest SortedCOOToCSR; +TEST_P(SortedCOOToCSR, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + + int nnz = 8; + + int *in, *out, *exp; + + int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int *exp_h = new int[4]{0, 2, 4, 6}; + + raft::allocate(in, nnz, true); + raft::allocate(exp, 4, true); + raft::allocate(out, 4, true); + + raft::update_device(in, in_h, nnz, stream); + raft::update_device(exp, exp_h, 4, stream); + + 
convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + + ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); + + cudaStreamDestroy(stream); + + delete[] in_h; + delete[] exp_h; + + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(exp)); + CUDA_CHECK(cudaFree(out)); +} + +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, + ::testing::ValuesIn(inputsf)); + +/******************************** adj graph ********************************/ + +template +struct CSRAdjGraphInputs { + Index_ n_rows; + Index_ n_cols; + std::vector row_ind; + std::vector adj; // To avoid vector optimization + std::vector verify; +}; + +template +class CSRAdjGraphTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + cudaStreamCreate(&stream); + nnz = params.verify.size(); + + raft::allocate(row_ind, params.n_rows); + raft::allocate(adj, params.n_rows * params.n_cols); + raft::allocate(result, nnz, true); + raft::allocate(verify, nnz); + } + + void Run() { + raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream); + raft::update_device(adj, reinterpret_cast(params.adj.data()), + params.n_rows * params.n_cols, stream); + raft::update_device(verify, params.verify.data(), nnz, stream); + + convert::csr_adj_graph_batched( + row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream); + + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(row_ind)); + CUDA_CHECK(cudaFree(adj)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); + cudaStreamDestroy(stream); + } + + protected: + CSRAdjGraphInputs params; + cudaStream_t stream; + Index_ nnz; + Index_ *row_ind, *result, *verify; + bool *adj; +}; + +using CSRAdjGraphTestI = CSRAdjGraphTest; +TEST_P(CSRAdjGraphTestI, Result) { Run(); } + +using CSRAdjGraphTestL = CSRAdjGraphTest; +TEST_P(CSRAdjGraphTestL, Result) { Run(); } + +const 
std::vector> csradjgraph_inputs_i = {
+  {3,
+   6,
+   {0, 3, 6},
+   {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+   {0, 1, 2, 0, 1, 2, 0, 1, 2}},
+};
+const std::vector> csradjgraph_inputs_l = {
+  {3,
+   6,
+   {0, 3, 6},
+   {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+   {0, 1, 2, 0, 1, 2, 0, 1, 2}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI,
+                        ::testing::ValuesIn(csradjgraph_inputs_i));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL,
+                        ::testing::ValuesIn(csradjgraph_inputs_l));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
new file mode 100644
index 0000000000..625772a842
--- /dev/null
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+/** Inputs and expected outputs of one CSR row-slice test case. */
+template
+struct CSRRowSliceInputs {
+  value_idx start_row;
+  value_idx stop_row;
+
+  std::vector indptr_h;
+  std::vector indices_h;
+  std::vector data_h;
+
+  std::vector out_indptr_ref_h;
+  std::vector out_indices_ref_h;
+  std::vector out_data_ref_h;
+};
+
+/** Stream operator for the test parameters; writes nothing to `os`. */
+template
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRRowSliceInputs &dims) {
+  return os;
+}
+
+/** Fixture that runs the row slice in SetUp() and checks it in compare(). */
+template
+class CSRRowSliceTest
+    : public ::testing::TestWithParam> {
+ protected:
+  /** Allocates the device buffers and uploads inputs and references. */
+  void make_data() {
+    std::vector indptr_h = params.indptr_h;
+    std::vector indices_h = params.indices_h;
+    std::vector data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector out_indices_ref_h = params.out_indices_ref_h;
+    std::vector out_data_ref_h = params.out_data_ref_h;
+
+    allocate(out_indptr_ref, out_indptr_ref_h.size());
+    allocate(out_indices_ref, out_indices_ref_h.size());
+    allocate(out_data_ref, out_data_ref_h.size());
+
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
+
+    allocate(out_indptr, out_indptr_ref_h.size());
+    allocate(out_indices, out_indices_ref_h.size());
+    allocate(out_data, out_data_ref_h.size());
+  }
+
+  /** Creates the stream, stages the data and runs both slice steps. */
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRRowSliceInputs>::GetParam();
+    std::shared_ptr alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    make_data();
+
+    int csr_start_offset;
+    int csr_stop_offset;
+
+    raft::sparse::op::csr_row_slice_indptr(
+      params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset,
+      &csr_stop_offset, stream);
+
+    raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset,
+                                             indices, data, out_indices,
+                                             out_data, stream);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  /** Frees every device buffer and destroys the stream from SetUp(). */
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_indptr));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_data));
+    CUDA_CHECK(cudaFree(out_indptr_ref));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_data_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  /** Compares the three output arrays with their device-side references. */
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare()));
+  }
+
+ protected:
+  cudaStream_t stream;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_idx *out_indptr, *out_indices;
+  value_t *out_data;
+
+  // expected output data
+  value_idx *out_indptr_ref, *out_indices_ref;
+  value_t *out_data_ref;
+
+  CSRRowSliceInputs params;
+};
+
+const std::vector> inputs_i32_f = {
+  {1,
+   3,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {0, 2, 4, 6},
+   {0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}},
+  {
+    2,
+    3,
+    {0, 2, 4, 6, 8},
+    {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+    {0, 2, 4},
+    {0, 1, 0, 1},  // indices
+    {50.0f, 28.0f, 16.0f, 2.0f},
+  }
+
+};
+typedef CSRRowSliceTest CSRRowSliceTestF;
+TEST_P(CSRRowSliceTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
new file mode 100644
index 0000000000..5535df4fe3
--- /dev/null
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+/** Inputs and expected dense output of one CSR-to-dense test case. */
+template
+struct CSRToDenseInputs {
+  value_idx nrows;
+  value_idx ncols;
+
+  std::vector indptr_h;
+  std::vector indices_h;
+  std::vector data_h;
+
+  std::vector out_ref_h;
+};
+
+/** Stream operator for the test parameters; writes nothing to `os`. */
+template
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRToDenseInputs &dims) {
+  return os;
+}
+
+/** Fixture that runs csr_to_dense in SetUp() and checks it in compare(). */
+template
+class CSRToDenseTest
+    : public ::testing::TestWithParam> {
+ protected:
+  /** Allocates the device buffers and uploads inputs and the reference. */
+  void make_data() {
+    std::vector indptr_h = params.indptr_h;
+    std::vector indices_h = params.indices_h;
+    std::vector data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector out_ref_h = params.out_ref_h;
+
+    allocate(out_ref, out_ref_h.size());
+
+    update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream);
+
+    allocate(out, out_ref_h.size());
+  }
+
+  /** Creates stream and cuSPARSE handle, stages data, runs the conversion. */
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRToDenseInputs>::GetParam();
+    std::shared_ptr alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&handle));
+
+    make_data();
+
+    convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices,
+                          data, params.nrows, out, stream, true);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUSPARSE_CHECK(cusparseDestroy(handle));
+  }
+
+  /** Frees every device buffer and destroys the stream from SetUp(). */
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out));
+    CUDA_CHECK(cudaFree(out_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  /** Compares the dense output with its device-side reference. */
+  void compare() {
+    ASSERT_TRUE(
+
devArrMatch(out, out_ref, params.out_ref_h.size(), Compare()));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t handle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out;
+
+  // expected output data
+  value_t *out_ref;
+
+  CSRToDenseInputs params;
+};
+
+const std::vector> inputs_i32_f = {
+  {4,
+   4,
+   {0, 2, 4, 6, 8},
+   {0, 1, 2, 3, 0, 1, 2, 3},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f,
+    0.0f, 0.0f, 16.0f, 2.0f}},
+};
+typedef CSRToDenseTest CSRToDenseTestF;
+TEST_P(CSRToDenseTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
new file mode 100644
index 0000000000..c257d6eb3c
--- /dev/null
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+/** Inputs and expected outputs of one CSR transpose test case. */
+template
+struct CSRTransposeInputs {
+  value_idx nrows;
+  value_idx ncols;
+  value_idx nnz;
+
+  std::vector indptr_h;
+  std::vector indices_h;
+  std::vector data_h;
+
+  std::vector out_indptr_ref_h;
+  std::vector out_indices_ref_h;
+  std::vector out_data_ref_h;
+};
+
+/** Stream operator for the test parameters; writes nothing to `os`. */
+template
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRTransposeInputs &dims) {
+  return os;
+}
+
+/** Fixture that runs csr_transpose in SetUp() and checks it in compare(). */
+template
+class CSRTransposeTest
+    : public ::testing::TestWithParam> {
+ protected:
+  /** Allocates the device buffers and uploads inputs and references. */
+  void make_data() {
+    std::vector indptr_h = params.indptr_h;
+    std::vector indices_h = params.indices_h;
+    std::vector data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector out_indices_ref_h = params.out_indices_ref_h;
+    std::vector out_data_ref_h = params.out_data_ref_h;
+
+    allocate(out_indptr_ref, out_indptr_ref_h.size());
+    allocate(out_indices_ref, out_indices_ref_h.size());
+    allocate(out_data_ref, out_data_ref_h.size());
+
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
+
+    allocate(out_indptr, out_indptr_ref_h.size());
+    allocate(out_indices, out_indices_ref_h.size());
+    allocate(out_data, out_data_ref_h.size());
+  }
+
+  /** Creates stream and cuSPARSE handle, stages data, runs the transpose. */
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRTransposeInputs>::GetParam();
+    std::shared_ptr alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&handle));
+
+    make_data();
+
+    raft::sparse::linalg::csr_transpose(
+      handle, indptr, indices, data, out_indptr, out_indices, out_data,
+      params.nrows, params.ncols, params.nnz, alloc, stream);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUSPARSE_CHECK(cusparseDestroy(handle));
+  }
+
+  /** Frees every device buffer and destroys the stream from SetUp(). */
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_indptr));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_data));
+    CUDA_CHECK(cudaFree(out_indptr_ref));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_data_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  /** Compares the three output arrays with their device-side references. */
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare()));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t handle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_idx *out_indptr, *out_indices;
+  value_t *out_data;
+
+  // expected output data
+  value_idx *out_indptr_ref, *out_indices_ref;
+  value_t *out_data_ref;
+
+  CSRTransposeInputs params;
+};
+
+const std::vector> inputs_i32_f = {
+  {
+    4,
+    2,
+    8,
+    {0, 2, 4, 6, 8},
+    {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+    {0, 4, 8},
+    {0, 1, 2, 3, 0, 1, 2, 3},  // indices
+    {1.0f, 1.0f, 50.0f, 16.0f, 3.0f, 5.0f, 28.0f, 2.0f},
+  },
+};
+typedef CSRTransposeTest CSRTransposeTestF;
+TEST_P(CSRTransposeTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end
namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
new file mode 100644
index 0000000000..5d687ad92b
--- /dev/null
+++ b/cpp/test/sparse/degree.cu
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include "../test_utils.h"
+
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+
+/** Parameters of a degree test case; the tests below use fixed local data. */
+template
+struct SparseDegreeInputs {
+  int m, n, nnz;
+  unsigned long long int seed;
+};
+
+/** Empty fixture: each test allocates and releases its own device buffers. */
+template
+class SparseDegreeTests
+    : public ::testing::TestWithParam> {
+ protected:
+  void SetUp() override {}
+
+  void TearDown() override {}
+
+ protected:
+  SparseDegreeInputs params;
+};
+
+const std::vector> inputsf = {{5, 10, 5, 1234ULL}};
+
+// Runs coo_degree on five row ids and compares with the expected counts.
+typedef SparseDegreeTests COODegree;
+TEST_P(COODegree, Result) {
+  int *in_rows, *verify, *results;
+
+  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  int verify_h[5] = {2, 1, 2, 0, 0};
+
+  raft::allocate(in_rows, 5);
+  raft::allocate(verify, 5, true);
+  raft::allocate(results, 5, true);
+
+  raft::update_device(in_rows, *&in_rows_h, 5, 0);
+  raft::update_device(verify, *&verify_h, 5, 0);
+
+  linalg::coo_degree<32>(in_rows, 5, results, 0);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare()));
+
+  CUDA_CHECK(cudaFree(in_rows));
+  CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaFree(results));
+}
+
+// Runs coo_degree_nz on five (row, value) pairs and compares the counts.
+typedef SparseDegreeTests COODegreeNonzero;
+TEST_P(COODegreeNonzero, Result) {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  int *in_rows, *verify, *results;
+  float *in_vals;
+
+  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
+  int verify_h[5] = {1, 0, 2, 0, 0};
+
+  raft::allocate(in_rows, 5);
+  raft::allocate(verify, 5, true);
+  raft::allocate(results, 5, true);
+  raft::allocate(in_vals, 5, true);
+
+  raft::update_device(in_rows, *&in_rows_h, 5, 0);
+  raft::update_device(verify, *&verify_h, 5, 0);
+  raft::update_device(in_vals, *&in_vals_h, 5, 0);
+
+  linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream);
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare()));
+
+  CUDA_CHECK(cudaFree(in_rows));
+  CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaFree(results));
+  CUDA_CHECK(cudaFree(in_vals));
+
+  CUDA_CHECK(cudaStreamDestroy(stream));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree,
+                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero,
+                        ::testing::ValuesIn(inputsf));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
new file mode 100644
index 0000000000..a841da661d
--- /dev/null
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+/** CSR input, expected dense distances and metric of one COO SPMV case. */
+template
+struct SparseDistanceCOOSPMVInputs {
+  value_idx n_cols;
+
+  std::vector indptr_h;
+  std::vector indices_h;
+  std::vector data_h;
+
+  std::vector out_dists_ref_h;
+
+  raft::distance::DistanceType metric;
+
+  float metric_arg = 0.0;
+};
+
+/** Stream operator for the test parameters; writes nothing to `os`. */
+template
+::std::ostream &operator<<(
+  ::std::ostream &os,
+  const SparseDistanceCOOSPMVInputs &dims) {
+  return os;
+}
+
+/** Fixture that computes distances in SetUp() and checks them in compare(). */
+template
+class SparseDistanceCOOSPMVTest
+    : public ::testing::TestWithParam<
+        SparseDistanceCOOSPMVInputs> {
+ public:
+  /** Runs the forward SPMV over B's COO rows and, if `rev`, the reverse one. */
+  template
+  void compute_dist(reduce_f reduce_func, accum_f accum_func,
+                    write_f write_func, bool rev = true) {
+    raft::mr::device::buffer coo_rows(
+      dist_config.allocator, dist_config.stream,
+      max(dist_config.b_nnz, dist_config.a_nnz));
+
+    raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows,
+                                      coo_rows.data(), dist_config.b_nnz,
+                                      dist_config.stream);
+
+    balanced_coo_pairwise_generalized_spmv(
+      out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+      write_func);
+
+    if (rev) {
+      raft::sparse::convert::csr_to_coo(dist_config.a_indptr,
+                                        dist_config.a_nrows, coo_rows.data(),
+                                        dist_config.a_nnz, dist_config.stream);
+
+      balanced_coo_pairwise_generalized_spmv_rev(
+        out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+        write_func);
+    }
+  }
+
+  /** Dispatches on params.metric; throws raft::exception for other metrics. */
+  void run_spmv() {
+    switch (params.metric) {
+      case raft::distance::DistanceType::InnerProduct:
+        compute_dist(Product(), Sum(), AtomicAdd(), true);
+        break;
+      case raft::distance::DistanceType::L2Unexpanded:
+        compute_dist(SqDiff(), Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::Canberra:
+        compute_dist(
+          [] __device__(value_t a, value_t b) {
+            value_t d = fabsf(a) + fabsf(b);
+            return ((d != 0) * fabsf(a - b)) / (d + (d == 0));
+          },
+          Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::L1:
+        compute_dist(AbsDiff(), Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::Linf:
+        compute_dist(AbsDiff(), Max(), AtomicMax());
+        break;
+      case raft::distance::DistanceType::LpUnexpanded: {
+        compute_dist(PDiff(params.metric_arg), Sum(), AtomicAdd());
+        float p = 1.0f / params.metric_arg;
+        raft::linalg::unaryOp(
+          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
+          [=] __device__(value_t input) { return powf(input, p); },
+          dist_config.stream);
+
+      } break;
+      default:
+        throw raft::exception("Unknown distance");
+    }
+  }
+
+ protected:
+  /** Allocates the device buffers and uploads the CSR input and reference. */
+  void make_data() {
+    std::vector indptr_h = params.indptr_h;
+    std::vector indices_h = params.indices_h;
+    std::vector data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector out_dists_ref_h = params.out_dists_ref_h;
+
+    allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+  }
+
+  /** Builds dist_config with the same matrix as A and B, then runs the SPMV. */
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceCOOSPMVInputs>::GetParam();
+    std::shared_ptr alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+
+    make_data();
+
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
+    dist_config.b_indices = indices;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
+    dist_config.a_indices = indices;
+    dist_config.a_data = data;
+    dist_config.handle = cusparseHandle;
+    dist_config.allocator = alloc;
+    dist_config.stream = stream;
+
+    int out_size = dist_config.a_nrows * dist_config.b_nrows;
+
+    allocate(out_dists, out_size);
+
+    run_spmv();
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  /** Frees the device buffers, then the handle and stream from SetUp(). */
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  /** Prints both arrays, then compares them with a 1e-3 tolerance. */
+  void compare() {
+    raft::print_device_vector("expected: ", out_dists_ref,
+                              params.out_dists_ref_h.size(), std::cout);
+    raft::print_device_vector("out_dists: ", out_dists,
+                              params.out_dists_ref_h.size(), std::cout);
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                            params.out_dists_ref_h.size(),
+                            CompareApprox(1e-3)));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t cusparseHandle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out_dists, *out_dists_ref;
+
+  raft::sparse::distance::distances_config_t dist_config;
+
+  SparseDistanceCOOSPMVInputs params;
+};
+
+const std::vector> inputs_i32_f = {
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
+   raft::distance::DistanceType::InnerProduct},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Unexpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0,
1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 
6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 
1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, 
+ 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 
0.99296,
+     0.0,
+     0.42180,
+     0.42274,
+     1.41476,
+     0.42180,
+     0.0,
+     0.84454,
+     1.41570,
+     0.42274,
+     0.84454,
+     0.0,
+   },
+   raft::distance::DistanceType::L1}
+
+};
+
+typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestF;
+TEST_P(SparseDistanceCOOSPMVTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, SparseDistanceCOOSPMVTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // namespace distance
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/dist_csr_spmv.cu b/cpp/test/sparse/dist_csr_spmv.cu
new file mode 100644
index 0000000000..2405909c40
--- /dev/null
+++ b/cpp/test/sparse/dist_csr_spmv.cu
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+/** CSR input, expected dense distances and metric of one CSR SPMV case. */
+template
+struct SparseDistanceCSRSPMVInputs {
+  value_idx n_cols;
+
+  std::vector indptr_h;
+  std::vector indices_h;
+  std::vector data_h;
+
+  std::vector out_dists_ref_h;
+
+  raft::distance::DistanceType metric;
+
+  float metric_arg = 0.0;
+};
+
+/** Stream operator for the test parameters; writes nothing to `os`. */
+template
+::std::ostream &operator<<(
+  ::std::ostream &os,
+  const SparseDistanceCSRSPMVInputs &dims) {
+  return os;
+}
+
+/** Fixture that computes distances in SetUp() and checks them in compare(). */
+template
+class SparseDistanceCSRSPMVTest
+    : public ::testing::TestWithParam<
+        SparseDistanceCSRSPMVInputs> {
+ public:
+  /** Forwards both functors to generalized_csr_pairwise_semiring. */
+  template
+  void compute_dist(reduce_f reduce_func, accum_f accum_func) {
+    generalized_csr_pairwise_semiring(
+      out_dists, dist_config, reduce_func, accum_func);
+  }
+
+  /** Dispatches on params.metric; throws raft::exception for other metrics. */
+  void run_spmv() {
+    switch (params.metric) {
+      case raft::distance::DistanceType::InnerProduct:
+        compute_dist(Product(), Sum());
+        break;
+      case raft::distance::DistanceType::L2Unexpanded:
+        compute_dist(SqDiff(), Sum());
+        break;
+      case raft::distance::DistanceType::Canberra:
+        compute_dist(
+          [] __device__(value_t a, value_t b) {
+            value_t d = fabsf(a) + fabsf(b);
+            return ((d != 0) * fabsf(a - b)) / (d + (d == 0));
+          },
+          Sum());
+        break;
+      case raft::distance::DistanceType::L1:
+        compute_dist(AbsDiff(), Sum());
+        break;
+      case raft::distance::DistanceType::Linf:
+        compute_dist(AbsDiff(), Max());
+        break;
+      case raft::distance::DistanceType::LpUnexpanded: {
+        compute_dist(PDiff(params.metric_arg), Sum());
+        float p = 1.0f / params.metric_arg;
+        raft::linalg::unaryOp(
+          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
+          [=] __device__(value_t input) { return powf(input, p); },
+          dist_config.stream);
+
+      } break;
+      default:
+        throw raft::exception("Unknown distance");
+    }
+  }
+
+ protected:
+  /** Allocates the device buffers and uploads the CSR input and reference. */
+  void make_data() {
+    std::vector indptr_h = params.indptr_h;
+    std::vector indices_h = params.indices_h;
+    std::vector data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector out_dists_ref_h = params.out_dists_ref_h;
+
+    allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+  }
+
+  /** Builds dist_config with the same matrix as A and B, then runs the SPMV. */
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceCSRSPMVInputs>::GetParam();
+    std::shared_ptr alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+
+    make_data();
+
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
+    dist_config.b_indices = indices;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
+    dist_config.a_indices = indices;
+    dist_config.a_data = data;
+    dist_config.handle = cusparseHandle;
+    dist_config.allocator = alloc;
+    dist_config.stream = stream;
+
+    int out_size = dist_config.a_nrows * dist_config.b_nrows;
+
+    allocate(out_dists, out_size);
+
+    run_spmv();
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  /** Frees the device buffers, then the handle and stream from SetUp(). */
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  /** Prints both arrays, then compares them with a 1e-3 tolerance. */
+  void compare() {
+    raft::print_device_vector("expected: ", out_dists_ref,
+                              params.out_dists_ref_h.size(), std::cout);
+    raft::print_device_vector("out_dists: ", out_dists,
+                              params.out_dists_ref_h.size(), std::cout);
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                            params.out_dists_ref_h.size(),
+                            CompareApprox(1e-3)));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t cusparseHandle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out_dists, *out_dists_ref;
+
+  raft::sparse::distance::distances_config_t dist_config;
+
+  SparseDistanceCSRSPMVInputs params;
+};
+
+const std::vector> inputs_i32_f = {
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
+   raft::distance::DistanceType::InnerProduct},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Unexpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+
3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 
0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 
1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 
0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1} + +}; + +typedef SparseDistanceCSRSPMVTest SparseDistanceCSRSPMVTestF; +TEST_P(SparseDistanceCSRSPMVTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceCSRSPMVTests, SparseDistanceCSRSPMVTestF, + ::testing::ValuesIn(inputs_i32_f)); + +}; // namespace distance +}; // end namespace sparse +}; // end namespace raft diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu new file mode 100644 index 0000000000..53e8838b65 --- /dev/null +++ b/cpp/test/sparse/distance.cu @@ -0,0 +1,764 @@ +/* + * 
Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +#include "../test_utils.h" + +namespace raft { +namespace sparse { +namespace distance { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseDistanceInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + + raft::distance::DistanceType metric; + + float metric_arg = 0.0; +}; + +template +::std::ostream &operator<<( + ::std::ostream &os, const SparseDistanceInputs &dims) { + return os; +} + +template +class SparseDistanceTest + : public ::testing::TestWithParam> { + protected: + void make_data() { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + allocate(indptr, indptr_h.size()); + allocate(indices, indices_h.size()); + allocate(data, data_h.size()); + + update_device(indptr, indptr_h.data(), indptr_h.size(), stream); + update_device(indices, indices_h.data(), indices_h.size(), stream); + update_device(data, data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + + allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + stream); + } + + void SetUp() 
override { + params = ::testing::TestWithParam< + SparseDistanceInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + CUDA_CHECK(cudaStreamCreate(&stream)); + + CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); + + make_data(); + + raft::sparse::distance::distances_config_t dist_config; + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr; + dist_config.b_indices = indices; + dist_config.b_data = data; + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr; + dist_config.a_indices = indices; + dist_config.a_data = data; + dist_config.handle = cusparseHandle; + dist_config.allocator = alloc; + dist_config.stream = stream; + + int out_size = dist_config.a_nrows * dist_config.b_nrows; + + allocate(out_dists, out_size); + + pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaFree(indptr)); + CUDA_CHECK(cudaFree(indices)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(out_dists)); + CUDA_CHECK(cudaFree(out_dists_ref)); + } + + void compare() { + // skip Hellinger test due to sporadic CI issue + // https://github.com/rapidsai/cuml/issues/3477 + if (params.metric == raft::distance::DistanceType::HellingerExpanded) { + GTEST_SKIP(); + } else { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, + params.out_dists_ref_h.size(), + CompareApprox(1e-3))); + } + } + + protected: + cudaStream_t stream; + cusparseHandle_t cusparseHandle; + + // input data + value_idx *indptr, *indices; + value_t *data; + + // output data + value_t *out_dists, *out_dists_ref; + + SparseDistanceInputs params; +}; + +const std::vector> inputs_i32_f = { + {2, + {0, 
2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + raft::distance::DistanceType::L2Expanded}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0}, + raft::distance::DistanceType::InnerProduct}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + raft::distance::DistanceType::L2Unexpanded}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, + 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., + 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, + 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., + 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, + 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, + 0.31195441, 
0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, + 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, + 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, + 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., + 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, + 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., + 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + 0.61547536, 0.68185144, 1., 0.}, + raft::distance::DistanceType::CosineExpanded}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {0.0, + 0.42857142857142855, + 0.7142857142857143, + 0.75, + 0.2857142857142857, + 0.75, + 0.7142857142857143, + 0.5, + 1.0, + 0.6666666666666666, + 0.42857142857142855, + 0.0, + 0.75, + 0.625, + 0.375, + 0.42857142857142855, + 0.75, + 0.375, + 0.75, + 0.7142857142857143, + 0.7142857142857143, + 0.75, + 0.0, + 0.7142857142857143, + 0.42857142857142855, + 0.7142857142857143, + 0.6666666666666666, + 0.625, + 0.6666666666666666, + 1.0, + 0.75, + 0.625, + 0.7142857142857143, + 0.0, + 0.5, + 0.5714285714285714, + 1.0, + 0.8, + 0.5, + 0.6666666666666666, + 0.2857142857142857, + 0.375, + 0.42857142857142855, + 0.5, + 0.0, + 0.6666666666666666, + 0.7777777777777778, + 0.4444444444444444, + 0.7777777777777778, + 0.75, + 0.75, + 0.42857142857142855, + 0.7142857142857143, + 0.5714285714285714, + 0.6666666666666666, + 0.0, + 0.7142857142857143, + 0.5, + 0.5, + 0.8571428571428571, + 0.7142857142857143, + 
0.75, + 0.6666666666666666, + 1.0, + 0.7777777777777778, + 0.7142857142857143, + 0.0, + 0.42857142857142855, + 0.8571428571428571, + 0.8333333333333334, + 0.5, + 0.375, + 0.625, + 0.8, + 0.4444444444444444, + 0.5, + 0.42857142857142855, + 0.0, + 0.7777777777777778, + 0.75, + 1.0, + 0.75, + 0.6666666666666666, + 0.5, + 0.7777777777777778, + 0.5, + 0.8571428571428571, + 0.7777777777777778, + 0.0, + 1.0, + 0.6666666666666666, + 0.7142857142857143, + 1.0, + 0.6666666666666666, + 0.75, + 0.8571428571428571, + 0.8333333333333334, + 0.75, + 1.0, + 0.0}, + raft::distance::DistanceType::JaccardExpanded}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 
4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 
1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 
4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 
0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1}, + {10, + {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, + 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, + 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, + 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, + 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, + 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, + 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, + 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, + 0.26190054, 0.2077349, 0.10803964}, + {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, + 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, + 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, + 1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01, + 9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 
8.98332369e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01, + 6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00, + 1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01, + 8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01, + 7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01, + 9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01, + 0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00, + 9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01, + 8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0, + 1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01, + 8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01, + 8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01, + 1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01, + 7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01, + 6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01, + 9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00, + 0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01, + 1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01, + 7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08}, + // Dataset is L1 normalized into pdfs + raft::distance::DistanceType::HellingerExpanded}}; + +typedef SparseDistanceTest SparseDistanceTestF; +TEST_P(SparseDistanceTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, + ::testing::ValuesIn(inputs_i32_f)); + +}; // namespace distance +}; // end namespace sparse +}; // end namespace raft diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu new file mode 100644 index 0000000000..f7954f899f --- /dev/null +++ b/cpp/test/sparse/filter.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../test_utils.h" + +#include +#include +#include +#include + +#include + +namespace raft { +namespace sparse { + +template +struct SparseFilterInputs { + int m, n, nnz; + unsigned long long int seed; +}; + +template +class SparseFilterTests + : public ::testing::TestWithParam> { + protected: + void SetUp() override {} + + void TearDown() override {} + + protected: + SparseFilterInputs params; +}; + +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; + +typedef SparseFilterTests COORemoveZeros; +TEST_P(COORemoveZeros, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + params = ::testing::TestWithParam>::GetParam(); + + float *in_h_vals = new float[params.nnz]; + + COO in(alloc, stream, params.nnz, 5, 5); + + raft::random::Rng r(params.seed); + r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); + + raft::update_host(in_h_vals, in.vals(), params.nnz, stream); + + in_h_vals[0] = 0; + in_h_vals[2] = 0; + in_h_vals[3] = 0; + + int *in_h_rows = new int[params.nnz]; + int *in_h_cols = new int[params.nnz]; + + for (int i = 0; i < params.nnz; i++) { + in_h_rows[i] = params.nnz - i - 1; + in_h_cols[i] = i; + } + + raft::update_device(in.rows(), in_h_rows, params.nnz, stream); + raft::update_device(in.cols(), in_h_cols, params.nnz, stream); + raft::update_device(in.vals(), 
in_h_vals, params.nnz, stream); + + op::coo_sort(&in, alloc, stream); + + int out_rows_ref_h[2] = {0, 3}; + int out_cols_ref_h[2] = {4, 1}; + + float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; + + COO out_ref(alloc, stream, 2, 5, 5); + COO out(alloc, stream); + + raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); + raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); + raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); + + op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); + + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, + raft::Compare())); + + CUDA_CHECK(cudaStreamDestroy(stream)); + free(out_vals_ref_h); + + delete[] in_h_rows; + delete[] in_h_cols; + delete[] in_h_vals; +} + +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, + ::testing::ValuesIn(inputsf)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu new file mode 100644 index 0000000000..0f773b9fee --- /dev/null +++ b/cpp/test/sparse/knn.cu @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include "../test_utils.h" + +#include +#include +#include +#include + +namespace raft { +namespace sparse { +namespace selection { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseKNNInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + int batch_size_index = 2; + int batch_size_query = 2; + + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2Expanded; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const SparseKNNInputs &dims) { + return os; +} + +template +class SparseKNNTest + : public ::testing::TestWithParam> { + protected: + void make_data() { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + printf("Allocating input\n"); + + allocate(indptr, indptr_h.size()); + allocate(indices, indices_h.size()); + allocate(data, data_h.size()); + + printf("Updating device\n"); + + update_device(indptr, indptr_h.data(), indptr_h.size(), stream); + update_device(indices, indices_h.data(), indices_h.size(), stream); + update_device(data, data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + printf("Allocating ref output\n"); + allocate(out_indices_ref, out_indices_ref_h.size()); + allocate(out_dists_ref, out_dists_ref_h.size()); + + printf("Updating device\n"); + + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), stream); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + stream); + + printf("Allocating final output\n"); + + allocate(out_dists, n_rows * k); + allocate(out_indices, n_rows * k); + + printf("Done.\n"); + } + + void SetUp() override { + params = + 
::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + + CUDA_CHECK(cudaStreamCreate(&stream)); + + CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); + + n_rows = params.indptr_h.size() - 1; + nnz = params.indices_h.size(); + k = params.k; + + printf("Making data\n"); + + make_data(); + + printf("About to run kselect\n"); + + raft::sparse::selection::brute_force_knn( + indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, + nnz, n_rows, params.n_cols, out_indices, out_dists, k, cusparseHandle, + alloc, stream, params.batch_size_index, params.batch_size_query, + params.metric); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + printf("Executed k-select"); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(indptr)); + CUDA_CHECK(cudaFree(indices)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(out_indices)); + CUDA_CHECK(cudaFree(out_dists)); + CUDA_CHECK(cudaFree(out_indices_ref)); + CUDA_CHECK(cudaFree(out_dists_ref)); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void compare() { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, + CompareApprox(1e-4))); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, + Compare())); + } + + protected: + cudaStream_t stream; + cusparseHandle_t cusparseHandle; + + int n_rows, nnz, k; + + // input data + value_idx *indptr, *indices; + value_t *data; + + // output data + value_idx *out_indices; + value_t *out_dists; + + value_idx *out_indices_ref; + value_t *out_dists_ref; + + SparseKNNInputs params; +}; + +const std::vector> inputs_i32_f = { + {9, // ncols + {0, 2, 4, 6, 8}, // indptr + {0, 4, 0, 3, 0, 2, 0, 8}, // indices + {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f}, // data + {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421}, // dists + {0, 3, 1, 0, 2, 0, 3, 0}, // inds + 2, + 2, + 2, + raft::distance::DistanceType::L2Expanded}}; +typedef SparseKNNTest SparseKNNTestF; +TEST_P(SparseKNNTestF, Result) { 
compare(); } +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, + ::testing::ValuesIn(inputs_i32_f)); + +}; // end namespace selection +}; // end namespace sparse +}; // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu new file mode 100644 index 0000000000..7adbbf8b9a --- /dev/null +++ b/cpp/test/sparse/norm.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include "../test_utils.h" + +#include +#include + +namespace raft { +namespace sparse { + +enum NormalizeMethod { MAX, L1 }; + +template +struct CSRRowNormalizeInputs { + NormalizeMethod method; + std::vector ex_scan; + std::vector in_vals; + std::vector verify; +}; + +template +class CSRRowNormalizeTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam< + CSRRowNormalizeInputs>::GetParam(); + cudaStreamCreate(&stream); + + raft::allocate(in_vals, params.in_vals.size()); + raft::allocate(verify, params.verify.size()); + raft::allocate(ex_scan, params.ex_scan.size()); + raft::allocate(result, params.verify.size(), true); + } + + void Run() { + Index_ n_rows = params.ex_scan.size(); + Index_ nnz = params.in_vals.size(); + + raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); + raft::update_device(in_vals, params.in_vals.data(), nnz, stream); + 
raft::update_device(verify, params.verify.data(), nnz, stream); + + switch (params.method) { + case MAX: + linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, + result, stream); + break; + case L1: + linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, + result, stream); + break; + } + + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(ex_scan)); + CUDA_CHECK(cudaFree(in_vals)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); + cudaStreamDestroy(stream); + } + + protected: + CSRRowNormalizeInputs params; + cudaStream_t stream; + Index_ *ex_scan; + Type_f *in_vals, *result, *verify; +}; + +using CSRRowNormalizeTestF = CSRRowNormalizeTest; +TEST_P(CSRRowNormalizeTestF, Result) { Run(); } + +using CSRRowNormalizeTestD = CSRRowNormalizeTest; +TEST_P(CSRRowNormalizeTestD, Result) { Run(); } + +const std::vector> csrnormalize_inputs_f = { + {MAX, + {0, 4, 8, 9}, + {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, + {L1, + {0, 4, 8, 9}, + {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +}; +const std::vector> csrnormalize_inputs_d = { + {MAX, + {0, 4, 8, 9}, + {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, + {L1, + {0, 4, 8, 9}, + {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +}; + +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, + ::testing::ValuesIn(csrnormalize_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, + ::testing::ValuesIn(csrnormalize_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu new file mode 100644 index 0000000000..b64fa25883 --- /dev/null +++ b/cpp/test/sparse/row_op.cu @@ -0,0 
+1,111 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include "../test_utils.h" + +#include +#include + +namespace raft { +namespace sparse { + +template +struct CSRRowOpInputs { + std::vector ex_scan; + std::vector verify; +}; + +/** Wrapper to call csr_row_op because the enclosing function of a __device__ + * lambda cannot have private or protected access within the class. 
*/ +template +void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, + Type_f *result, cudaStream_t stream) { + op::csr_row_op( + row_ind, n_rows, nnz, + [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { + for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; + }, + stream); +} + +template +class CSRRowOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); + cudaStreamCreate(&stream); + n_rows = params.ex_scan.size(); + nnz = params.verify.size(); + + raft::allocate(verify, nnz); + raft::allocate(ex_scan, n_rows); + raft::allocate(result, nnz, true); + } + + void Run() { + raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); + raft::update_device(verify, params.verify.data(), nnz, stream); + + csr_row_op_wrapper(ex_scan, n_rows, nnz, result, stream); + + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(ex_scan)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); + cudaStreamDestroy(stream); + } + + protected: + CSRRowOpInputs params; + cudaStream_t stream; + Index_ n_rows, nnz; + Index_ *ex_scan; + Type_f *result, *verify; +}; + +using CSRRowOpTestF = CSRRowOpTest; +TEST_P(CSRRowOpTestF, Result) { Run(); } + +using CSRRowOpTestD = CSRRowOpTest; +TEST_P(CSRRowOpTestD, Result) { Run(); } + +const std::vector> csrrowop_inputs_f = { + {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, +}; +const std::vector> csrrowop_inputs_d = { + {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, +}; + +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, + ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, + ::testing::ValuesIn(csrrowop_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu 
new file mode 100644 index 0000000000..46f2f6a844 --- /dev/null +++ b/cpp/test/sparse/selection.cu @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "../test_utils.h" + +#include +#include + +namespace raft { +namespace sparse { +namespace selection { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseSelectionInputs { + value_idx n_rows; + value_idx n_cols; + + std::vector dists_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + bool select_min; +}; + +template +::std::ostream &operator<<( + ::std::ostream &os, const SparseSelectionInputs &dims) { + return os; +} + +template +class SparseSelectionTest + : public ::testing::TestWithParam> { + protected: + void make_data() { + std::vector dists_h = params.dists_h; + + allocate(dists, n_rows * n_cols); + update_device(dists, dists_h.data(), dists_h.size(), stream); + + allocate(inds, n_rows * n_cols); + iota_fill(inds, n_rows, n_cols, stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + allocate(out_indices_ref, out_indices_ref_h.size()); + allocate(out_dists_ref, out_dists_ref_h.size()); + + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), stream); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + 
stream); + + allocate(out_dists, n_rows * k); + allocate(out_indices, n_rows * k); + } + + void SetUp() override { + params = ::testing::TestWithParam< + SparseSelectionInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + CUDA_CHECK(cudaStreamCreate(&stream)); + + n_rows = params.n_rows; + n_cols = params.n_cols; + k = params.k; + + make_data(); + + raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists, + out_indices, params.select_min, k, + stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaFree(dists)); + CUDA_CHECK(cudaFree(inds)); + CUDA_CHECK(cudaFree(out_indices)); + CUDA_CHECK(cudaFree(out_dists)); + CUDA_CHECK(cudaFree(out_indices_ref)); + CUDA_CHECK(cudaFree(out_dists_ref)); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void compare() { + ASSERT_TRUE( + devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, + Compare())); + } + + protected: + cudaStream_t stream; + + int n_rows, n_cols, k; + + // input data + value_t *dists; + value_idx *inds; + + // output data + value_idx *out_indices; + value_t *out_dists; + + value_idx *out_indices_ref; + value_t *out_dists_ref; + + SparseSelectionInputs params; +}; + +const std::vector> inputs_i32_f = { + {5, + 5, + {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, + 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, + {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}, + {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}, + 5, + true}}; +typedef SparseSelectionTest SparseSelectionTestF; +TEST_P(SparseSelectionTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, + ::testing::ValuesIn(inputs_i32_f)); + +}; // 
end namespace selection +}; // end namespace sparse +}; // end namespace raft diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu new file mode 100644 index 0000000000..b9a8b849eb --- /dev/null +++ b/cpp/test/sparse/sort.cu @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../test_utils.h" + +#include +#include + +#include + +namespace raft { +namespace sparse { + +template +struct SparseSortInput { + int m, n, nnz; + unsigned long long int seed; +}; + +template +class SparseSortTest : public ::testing::TestWithParam> { + protected: + void SetUp() override {} + + void TearDown() override {} + + protected: + SparseSortInput params; +}; + +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; + +typedef SparseSortTest COOSort; +TEST_P(COOSort, Result) { + int *in_rows, *in_cols, *verify; + float *in_vals; + + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + + raft::allocate(in_vals, params.nnz); + r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); + + int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); + int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); + int *verify_h = (int *)malloc(params.nnz * sizeof(int)); + + for (int i = 0; i < 
params.nnz; i++) { + in_rows_h[i] = params.nnz - i - 1; + verify_h[i] = i; + in_cols_h[i] = i; + } + + raft::allocate(in_rows, params.nnz); + raft::allocate(in_cols, params.nnz); + raft::allocate(verify, params.nnz); + + raft::update_device(in_rows, in_rows_h, params.nnz, stream); + + raft::update_device(in_cols, in_cols_h, params.nnz, stream); + raft::update_device(verify, verify_h, params.nnz, stream); + + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + stream); + + ASSERT_TRUE( + raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); + + delete[] in_rows_h; + delete[] in_cols_h; + delete[] verify_h; + + CUDA_CHECK(cudaFree(in_rows)); + CUDA_CHECK(cudaFree(in_cols)); + CUDA_CHECK(cudaFree(in_vals)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +INSTANTIATE_TEST_CASE_P(SparseSortTest, COOSort, ::testing::ValuesIn(inputsf)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu new file mode 100644 index 0000000000..07dd9d11a2 --- /dev/null +++ b/cpp/test/sparse/symmetrize.cu @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include "../test_utils.h" + +#include +#include + +#include + +namespace raft { +namespace sparse { + +template +struct SparseSymmetrizeInput { + int m, n, nnz; + unsigned long long int seed; +}; + +template +class SparseSymmetrizeTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override {} + + void TearDown() override {} + + protected: + SparseSymmetrizeInput params; +}; + +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; + +typedef SparseSymmetrizeTest COOSymmetrize; +TEST_P(COOSymmetrize, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); + + std::shared_ptr alloc( + new raft::mr::device::default_allocator); + + int nnz = 8; + + int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + + int *exp_rows_h = + new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int *exp_cols_h = + new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, + 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + + COO in(alloc, stream, nnz, 4, 4); + raft::update_device(in.rows(), *&in_rows_h, nnz, stream); + raft::update_device(in.cols(), *&in_cols_h, nnz, stream); + raft::update_device(in.vals(), *&in_vals_h, nnz, stream); + + COO out(alloc, stream); + + linalg::coo_symmetrize<32, float>( + &in, &out, + [] __device__(int row, int col, float val, float trans) { + return val + trans; + }, + alloc, stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + std::cout << out << std::endl; + + ASSERT_TRUE(out.nnz == nnz * 2); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, + raft::Compare())); + + 
cudaStreamDestroy(stream); + + delete[] in_rows_h; + delete[] in_cols_h; + delete[] in_vals_h; + + delete[] exp_rows_h; + delete[] exp_cols_h; + delete[] exp_vals_h; +} + +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, COOSymmetrize, + ::testing::ValuesIn(inputsf)); + +} // namespace sparse +} // namespace raft From 9798885207aded2cceb37e05053da5a8f59ab206 Mon Sep 17 00:00:00 2001 From: afender Date: Thu, 11 Feb 2021 15:55:38 -0600 Subject: [PATCH 06/11] perf check --- cpp/include/raft/handle.hpp | 2 ++ cpp/test/handle.cpp | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index a42fdd67b2..92fd6c2663 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -71,6 +71,8 @@ class handle_t { handle_t(const handle_t& h) : dev_id_(h.get_device()) {} handle_t(const handle_t&& h) : dev_id_(h.get_device()) {} + // light copy operator + // skip streams, comms, and libs handles handle_t& operator=(const handle_t& h) { prop_ = h.get_device_properties(); device_prop_initialized_ = true; diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 8fef4ead61..ead7382b1b 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -64,6 +64,18 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } +TEST(Raft, GetHandleFromPoolPerf) { + handle_t parent(100); + auto start = curTimeMillis(); + for (int i = 0; i < parent.get_num_internal_streams(); i++) { + auto child = parent.get_handle_from_internal_pool(i); + ASSERT_EQ(parent.get_internal_stream(i), child.get_stream()); + child.wait_on_user_stream(); + } + // upperbound on 0.1ms per child handle + ASSERT_LE(curTimeMillis() - start, 10); +} + TEST(Raft, GetHandleStreamViews) { handle_t parent(4); From def166f7c26d73e91db6a1d554dea6a32381f603 Mon Sep 17 00:00:00 2001 From: afender Date: Fri, 12 Feb 2021 12:26:36 -0600 Subject: [PATCH 07/11] reviews --- cpp/include/raft/handle.hpp | 34 
+++++++++++++++++----------------- cpp/test/handle.cpp | 6 +++--- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 92fd6c2663..42a1e4ebb8 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -68,17 +68,25 @@ class handle_t { host_allocator_(std::make_shared()) { create_resources(); } - handle_t(const handle_t& h) : dev_id_(h.get_device()) {} - handle_t(const handle_t&& h) : dev_id_(h.get_device()) {} - // light copy operator - // skip streams, comms, and libs handles - handle_t& operator=(const handle_t& h) { - prop_ = h.get_device_properties(); + /** + * @brief Construct a light handle copy from another + * user stream, cuda handles, comms and worker pool are not copied + * The user_stream of the returned handle is set to the specified stream + * of the other handle worker pool + * @param[in] stream_id stream id in `other` worker streams + * to be set as user stream in the constructed handle + * @param[in] n_streams number worker streams to be created + */ + handle_t(const handle_t& other, int stream_id, + int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()), streams_(n_streams) { + prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = get_device_allocator(); - host_allocator_ = get_host_allocator(); - return *this; + device_allocator_ = other.get_device_allocator(); + host_allocator_ = other.get_host_allocator(); + create_resources(); + set_stream(other.get_internal_stream(stream_id)); } /** Destroys all held-up resources */ @@ -160,14 +168,6 @@ class handle_t { return int_streams_vec; } - handle_t get_handle_from_internal_pool( - int stream_id, int n_streams = kNumDefaultWorkerStreams) const { - handle_t handle(n_streams); - handle = *this; - handle.set_stream(this->get_internal_stream(stream_id)); - return handle; - } - void wait_on_user_stream() const { CUDA_CHECK(cudaEventRecord(event_, 
user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index ead7382b1b..4cb9809844 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -53,7 +53,7 @@ TEST(Raft, GetInternalStreams) { TEST(Raft, GetHandleFromPool) { handle_t parent(4); - auto child = parent.get_handle_from_internal_pool(2); + handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream(2), child.get_stream()); ASSERT_EQ(0, child.get_num_internal_streams()); @@ -68,7 +68,7 @@ TEST(Raft, GetHandleFromPoolPerf) { handle_t parent(100); auto start = curTimeMillis(); for (int i = 0; i < parent.get_num_internal_streams(); i++) { - auto child = parent.get_handle_from_internal_pool(i); + handle_t child(parent, i); ASSERT_EQ(parent.get_internal_stream(i), child.get_stream()); child.wait_on_user_stream(); } @@ -79,7 +79,7 @@ TEST(Raft, GetHandleFromPoolPerf) { TEST(Raft, GetHandleStreamViews) { handle_t parent(4); - auto child = parent.get_handle_from_internal_pool(2); + handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); From 3b495a631c47650640d0cb5f1fb2f47e3283c9c8 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Tue, 16 Feb 2021 20:22:30 +0100 Subject: [PATCH 08/11] Add GHA to mark issues/prs as stale/rotten (#150) Issues and PRs without activity for 30d will be marked as stale. If there is no activity for 90d, they will be marked as rotten. 
Authors: - Jordan Jacobelli (@Ethyling) Approvers: - Dillon Cullinan (@dillon-cullinan) URL: https://github.com/rapidsai/raft/pull/150 --- .github/workflows/stale.yaml | 65 ++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/workflows/stale.yaml diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 0000000000..3b7de7ec69 --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,65 @@ +name: Mark stale and rotten issues and pull requests + +on: + schedule: + - cron: "0 * * * *" + +jobs: + mark-stale-issues: + runs-on: ubuntu-latest + steps: + - name: Mark Issues as Stale + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: > + This issue has been marked stale due to no recent activity in the past 30d. + Please close this issue if no further response or action is needed. + Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. + This issue will be marked rotten if there is no activity in the next 60d. + stale-issue-label: "stale" + days-before-issue-stale: 30 + days-before-issue-close: -1 + mark-stale-prs: + runs-on: ubuntu-latest + steps: + - name: Mark PRs as Stale + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: > + This PR has been marked stale due to no recent activity in the past 30d. + Please close this PR if it is no longer required. + Otherwise, please respond with a comment indicating any updates. + This PR will be marked rotten if there is no activity in the next 60d. 
+ stale-pr-label: "stale" + days-before-pr-stale: 30 + days-before-pr-close: -1 + mark-rotten-issues: + runs-on: ubuntu-latest + steps: + - name: Mark Issues as Rotten + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: > + This issue has been marked rotten due to no recent activity in the past 90d. + Please close this issue if no further response or action is needed. + Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. + stale-issue-label: "rotten" + days-before-issue-stale: 90 + days-before-issue-close: -1 + mark-rotten-prs: + runs-on: ubuntu-latest + steps: + - name: Mark PRs as Rotten + uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: > + This PR has been marked rotten due to no recent activity in the past 90d. + Please close this PR if it is no longer required. + Otherwise, please respond with a comment indicating any updates. 
+ stale-pr-label: "rotten" + days-before-pr-stale: 90 + days-before-pr-close: -1 From ac15d6932ab2f6ab23d9ae69d147a3708961dfc1 Mon Sep 17 00:00:00 2001 From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:11:52 -0500 Subject: [PATCH 09/11] Update stale GHA with exemptions & new labels (#152) Follows #150 Updates the stale GHA with the following changes: - [x] Uses `inactive-30d` and `inactive-90d` labels instead of `stale` and `rotten` - [x] Updates comments to reflect changes in labels - [x] Exempts the following labels from being marked `inactive-30d` or `inactive-90d` - `0 - Blocked` - `0 - Backlog` - `good first issue` Authors: - Mike Wendt (@mike-wendt) Approvers: - Ray Douglass (@raydouglass) URL: https://github.com/rapidsai/raft/pull/152 --- .github/workflows/stale.yaml | 50 +++++++++++++++--------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 3b7de7ec69..8b65da69aa 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -1,65 +1,57 @@ -name: Mark stale and rotten issues and pull requests +name: Mark inactive issues and pull requests on: schedule: - cron: "0 * * * *" jobs: - mark-stale-issues: + mark-inactive-30d: runs-on: ubuntu-latest steps: - - name: Mark Issues as Stale + - name: Mark 30 day inactive issues and pull requests uses: actions/stale@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: > - This issue has been marked stale due to no recent activity in the past 30d. + This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - This issue will be marked rotten if there is no activity in the next 60d. 
- stale-issue-label: "stale" + This issue will be labeled `inactive-90d` if there is no activity in the next 60 days. + stale-issue-label: "inactive-30d" + exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" days-before-issue-stale: 30 days-before-issue-close: -1 - mark-stale-prs: - runs-on: ubuntu-latest - steps: - - name: Mark PRs as Stale - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: > - This PR has been marked stale due to no recent activity in the past 30d. + This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days. Please close this PR if it is no longer required. Otherwise, please respond with a comment indicating any updates. - This PR will be marked rotten if there is no activity in the next 60d. - stale-pr-label: "stale" + This PR will be labeled `inactive-90d` if there is no activity in the next 60 days. + stale-pr-label: "inactive-30d" + exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" days-before-pr-stale: 30 days-before-pr-close: -1 - mark-rotten-issues: + operations-per-run: 50 + mark-inactive-90d: runs-on: ubuntu-latest steps: - - name: Mark Issues as Rotten + - name: Mark 90 day inactive issues and pull requests uses: actions/stale@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: > - This issue has been marked rotten due to no recent activity in the past 90d. + This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. 
- stale-issue-label: "rotten" + stale-issue-label: "inactive-90d" + exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" days-before-issue-stale: 90 days-before-issue-close: -1 - mark-rotten-prs: - runs-on: ubuntu-latest - steps: - - name: Mark PRs as Rotten - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: > - This PR has been marked rotten due to no recent activity in the past 90d. + This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days. Please close this PR if it is no longer required. Otherwise, please respond with a comment indicating any updates. - stale-pr-label: "rotten" + stale-pr-label: "inactive-90d" + exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" days-before-pr-stale: 90 days-before-pr-close: -1 + operations-per-run: 50 From 30e341f483003a1094e326c49780c9382846d867 Mon Sep 17 00:00:00 2001 From: afender Date: Wed, 17 Feb 2021 17:56:56 -0600 Subject: [PATCH 10/11] error check --- cpp/include/raft/handle.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 42a1e4ebb8..dbe7e83189 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -81,6 +81,9 @@ class handle_t { handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) : dev_id_(other.get_device()), streams_(n_streams) { + RAFT_EXPECTS( + other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; device_allocator_ = other.get_device_allocator(); From 88fff801285b9192e0a89e29467b2c52e168e6b2 Mon Sep 17 00:00:00 2001 From: Joseph <68436579+jolorunyomi@users.noreply.github.com> Date: Thu, 18 Feb 2021 10:07:34 -0600 Subject: [PATCH 11/11] Auto-label PRs based on their content (#117) This PR adds the GitHub action [PR Labeler](https://github.com/actions/labeler) to 
auto-label PRs based on their content. Labeling is managed with a configuration file `.github/labeler.yml` using the following [options](https://github.com/actions/labeler#usage). Authors: - Joseph (@jolorunyomi) Approvers: - AJ Schmidt (@ajschmidt8) - Mike Wendt (@mike-wendt) - Rick Ratzel (@rlratzel) URL: https://github.com/rapidsai/raft/pull/117 --- .github/labeler.yml | 16 ++++++++++++++++ .github/workflows/labeler.yml | 11 +++++++++++ 2 files changed, 27 insertions(+) create mode 100644 .github/labeler.yml create mode 100644 .github/workflows/labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000000..9809e2cc2e --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,16 @@ +# https://github.com/actions/labeler#common-examples +# Adapted from https://github.com/rapidsai/raft/blob/main/.github/CODEOWNERS +# Labels culled from https://github.com/rapidsai/raft/labels + +python: + - 'python/**' + +cpp: + - 'cpp/**' + +CMake: + - '**/CMakeLists.txt' + - '**/cmake/**' + +gpuCI: + - 'ci/**' diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000000..55117f774a --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,11 @@ +name: "Pull Request Labeler" +on: +- pull_request_target + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@main + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}"