From f86d76586e8a4c2f825ad724fe3c68e3fa3b9b74 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Feb 2022 18:44:45 -0500 Subject: [PATCH 01/24] iMoving more linalg prims from cuml --- cpp/include/raft/linalg/detail/axpy.hpp | 1 - cpp/include/raft/linalg/detail/gemm.hpp | 1 - cpp/include/raft/linalg/detail/gemv.hpp | 1 - cpp/include/raft/linalg/detail/lstsq.hpp | 444 ++++++++++++++++++ .../raft/linalg/detail/reduce_cols_by_key.cuh | 81 ++++ .../raft/linalg/detail/reduce_rows_by_key.cuh | 431 +++++++++++++++++ cpp/include/raft/linalg/detail/rsvd.cuh | 412 ++++++++++++++++ cpp/include/raft/linalg/detail/ternary_op.cuh | 102 ++++ cpp/include/raft/linalg/lstsq.hpp | 98 ++++ cpp/include/raft/linalg/power.cuh | 63 +++ .../raft/linalg/reduce_cols_by_key.cuh | 54 +++ .../raft/linalg/reduce_rows_by_key.cuh | 97 ++++ cpp/include/raft/linalg/rsvd.cuh | 109 +++++ cpp/include/raft/linalg/sqrt.cuh | 44 ++ cpp/include/raft/linalg/ternary_op.cuh | 49 ++ cpp/test/CMakeLists.txt | 5 + cpp/test/linalg/power.cu | 135 ++++++ cpp/test/linalg/reduce_cols_by_key.cu | 124 +++++ cpp/test/linalg/reduce_rows_by_key.cu | 262 +++++++++++ cpp/test/linalg/rsvd.cu | 315 +++++++++++++ cpp/test/linalg/sqrt.cu | 114 +++++ cpp/test/linalg/ternary_op.cu | 107 +++++ 22 files changed, 3046 insertions(+), 3 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/lstsq.hpp create mode 100644 cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh create mode 100644 cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh create mode 100644 cpp/include/raft/linalg/detail/rsvd.cuh create mode 100644 cpp/include/raft/linalg/detail/ternary_op.cuh create mode 100644 cpp/include/raft/linalg/lstsq.hpp create mode 100644 cpp/include/raft/linalg/power.cuh create mode 100644 cpp/include/raft/linalg/reduce_cols_by_key.cuh create mode 100644 cpp/include/raft/linalg/reduce_rows_by_key.cuh create mode 100644 cpp/include/raft/linalg/rsvd.cuh create mode 100644 cpp/include/raft/linalg/sqrt.cuh create mode 
100644 cpp/include/raft/linalg/ternary_op.cuh create mode 100644 cpp/test/linalg/power.cu create mode 100644 cpp/test/linalg/reduce_cols_by_key.cu create mode 100644 cpp/test/linalg/reduce_rows_by_key.cu create mode 100644 cpp/test/linalg/rsvd.cu create mode 100644 cpp/test/linalg/sqrt.cu create mode 100644 cpp/test/linalg/ternary_op.cu diff --git a/cpp/include/raft/linalg/detail/axpy.hpp b/cpp/include/raft/linalg/detail/axpy.hpp index f5527bf10f..c0ce398de9 100644 --- a/cpp/include/raft/linalg/detail/axpy.hpp +++ b/cpp/include/raft/linalg/detail/axpy.hpp @@ -20,7 +20,6 @@ #include "cublas_wrappers.hpp" -#include #include namespace raft::linalg::detail { diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 0ea1723a9e..29308304d8 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -20,7 +20,6 @@ #include "cublas_wrappers.hpp" -#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index 3692743152..ad2e5275cb 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -20,7 +20,6 @@ #include "cublas_wrappers.hpp" -#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp new file mode 100644 index 0000000000..e8aeccc9b0 --- /dev/null +++ b/cpp/include/raft/linalg/detail/lstsq.hpp @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + namespace linalg { + namespace detail { + + namespace { + +/** Operate a CUDA event if we're in the concurrent mode; no-op otherwise. */ + struct DeviceEvent { + private: + cudaEvent_t e; + + public: + DeviceEvent(bool concurrent) { + if (concurrent) + RAFT_CUDA_TRY(cudaEventCreate(&e)); + else + e = nullptr; + } + + ~DeviceEvent() { + if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e)); + } + + operator cudaEvent_t() const { return e; } + + void record(cudaStream_t stream) { + if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream)); + } + + void wait(cudaStream_t stream) { + if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u)); + } + + void wait() { + if (e != nullptr) raft::interruptible::synchronize(e); + } + + DeviceEvent &operator=(const DeviceEvent &other) = delete; + }; + +/** + * @brief Tells if the viewed CUDA stream is implicitly synchronized with the given stream. + * + * This can happen e.g. + * if the two views point to the same stream + * or sometimes when one of them is the legacy default stream. 
+ */ + bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b) { + // any stream is "synchronized" with itself + if (a.value() == b.value()) return true; + // legacy + blocking streams + unsigned int flags = 0; + if (a.is_default()) { + RAFT_CUDA_TRY(cudaStreamGetFlags(b.value(), &flags)); + if ((flags & cudaStreamNonBlocking) == 0) return true; + } + if (b.is_default()) { + RAFT_CUDA_TRY(cudaStreamGetFlags(a.value(), &flags)); + if ((flags & cudaStreamNonBlocking) == 0) return true; + } + return false; + } + + template + struct DivideByNonZero { + constexpr static const math_t + eps = math_t(1e-10); + + __device__ math_t + + operator()(const math_t a, const math_t b) const { + return raft::myAbs(b) >= eps ? a / b : a; + } + }; + + } // namespace + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. + * + * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, + * so it's not guaranteed to stay unmodified. 
+ */ + template + void lstsqSvdQR(const raft::handle_t &handle, + math_t *A, + const int n_rows, + const int n_cols, + const math_t *b, + math_t *w, + cudaStream_t stream) { + const int minmn = min(n_rows, n_cols); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + int cusolverWorkSetSize = 0; + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize( + cusolverH, n_rows, n_cols, &cusolverWorkSetSize)); + + rmm::device_uvector workset(cusolverWorkSetSize // cuSolver + + n_rows * minmn // U + + n_cols * n_cols // V + + minmn // S + + minmn // U^T * b + + 1 // devInfo + , + stream); + math_t *cusolverWorkSet = workset.data(); + math_t *U = cusolverWorkSet + cusolverWorkSetSize; + math_t *Vt = U + n_rows * minmn; + math_t *S = Vt + n_cols * n_cols; + math_t *Ub = S + minmn; + int *devInfo = reinterpret_cast(Ub + minmn); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd(cusolverH, + 'S', + 'S', + n_rows, + n_cols, + A, + n_rows, + S, + U, + n_rows, + Vt, + n_cols, + cusolverWorkSet, + cusolverWorkSetSize, + nullptr, + devInfo, + stream)); + raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); + raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); + raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream); + } + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). + * + * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, + * so it's not guaranteed to stay unmodified. 
+ */ + template + void lstsqSvdJacobi(const raft::handle_t &handle, + math_t *A, + const int n_rows, + const int n_cols, + const math_t *b, + math_t *w, + cudaStream_t stream) { + const int minmn = min(n_rows, n_cols); + gesvdjInfo_t gesvdj_params; + RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + int cusolverWorkSetSize = 0; + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY( + raft::linalg::detail::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + 1, + n_rows, + n_cols, + A, + n_rows, + nullptr, + nullptr, + n_rows, + nullptr, + n_cols, + &cusolverWorkSetSize, + gesvdj_params)); + rmm::device_uvector workset(cusolverWorkSetSize // cuSolver + + n_rows * minmn // U + + n_cols * minmn // V + + minmn // S + + minmn // U^T * b + + 1 // devInfo + , + stream); + math_t *cusolverWorkSet = workset.data(); + math_t *U = cusolverWorkSet + cusolverWorkSetSize; + math_t *V = U + n_rows * minmn; + math_t *S = V + n_cols * minmn; + math_t *Ub = S + minmn; + int *devInfo = reinterpret_cast(Ub + minmn); + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + 1, + n_rows, + n_cols, + A, + n_rows, + S, + U, + n_rows, + V, + n_cols, + cusolverWorkSet, + cusolverWorkSetSize, + devInfo, + gesvdj_params, + stream)); + raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); + raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); + raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream); + } + +/** Solves the linear ordinary least squares problem `Aw = b` + * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). 
+ * (`w = (A^T A)^-1 A^T b`) + */ + template + void lstsqEig(const raft::handle_t &handle, + const math_t *A, + const int n_rows, + const int n_cols, + const math_t *b, + math_t *w, + cudaStream_t stream) { + rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); + rmm::cuda_stream_view multAbStream = mainStream; + bool concurrent = false; + { + int sp_size = handle.get_stream_pool_size(); + if (sp_size > 0) { + multAbStream = handle.get_stream_from_stream_pool(0); + // check if the two streams can run concurrently + if (!are_implicitly_synchronized(mainStream, multAbStream)) { + concurrent = true; + } else if (sp_size > 1) { + mainStream = multAbStream; + multAbStream = handle.get_stream_from_stream_pool(1); + concurrent = true; + } + } + } + // the event is created only if the given raft handle is capable of running + // at least two CUDA streams without implicit synchronization. + DeviceEvent multAbDone(concurrent); + + rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); + math_t *Q = workset.data(); + math_t *QS = Q + n_cols * n_cols; + math_t *covA = QS + n_cols * n_cols; + math_t *S = covA + n_cols * n_cols; + math_t *Ab = S + n_cols; + + // covA <- A* A + math_t alpha = math_t(1); + math_t beta = math_t(0); + raft::linalg::gemm(handle, + A, + n_rows, + n_cols, + A, + covA, + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + mainStream); + + // Ab <- A* b + raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream); + multAbDone.record(multAbStream); + + // Q S Q* <- covA + raft::common::nvtx::push_range("raft::linalg::eigDC"); + raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream); + raft::common::nvtx::pop_range(); + + // QS <- Q invS + raft::linalg::matrixVectorOp( + QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero(), mainStream); + // covA <- QS Q* == Q invS Q* == inv(A* A) + raft::linalg::gemm(handle, + QS, + n_cols, + n_cols, + Q, + covA, + n_cols, + n_cols, + 
CUBLAS_OP_N, + CUBLAS_OP_T, + alpha, + beta, + mainStream); + multAbDone.wait(mainStream); + // w <- covA Ab == Q invS Q* A b == inv(A* A) A b + raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream); + } + +/** Solves the linear ordinary least squares problem `Aw = b` + * via QR decomposition of `A = QR`. + * (triangular system of equations `Rw = Q^T b`) + * + * @param A[in/out] - input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param b[in/out] - input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. + */ + template + void lstsqQR(const raft::handle_t &handle, + math_t *A, + const int n_rows, + const int n_cols, + math_t *b, + math_t *w, + cudaStream_t stream) { + cublasHandle_t cublasH = handle.get_cublas_handle(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows; + int n = n_cols; + + int info = 0; + rmm::device_uvector d_tau(n, stream); + rmm::device_scalar d_info(stream); + + const cublasSideMode_t side = CUBLAS_SIDE_LEFT; + const cublasOperation_t trans = CUBLAS_OP_T; + + int lwork_geqrf = 0; + int lwork_ormqr = 0; + int lwork = 0; + + const int lda = m; + const int ldb = m; + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY( + raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf)); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH, + side, + trans, + m, + 1, + n, + A, + lda, + d_tau.data(), + b, // C, + lda, // ldc, + &lwork_ormqr)); + + lwork = (lwork_geqrf > lwork_ormqr) ? 
lwork_geqrf : lwork_ormqr; + + rmm::device_uvector d_work(lwork, stream); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf( + cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + ASSERT(0 == info, "lstsq.h: QR wasn't successful"); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH, + side, + trans, + m, + 1, + n, + A, + lda, + d_tau.data(), + b, + ldb, + d_work.data(), + lwork, + d_info.data(), + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + ASSERT(0 == info, "lstsq.h: QR wasn't successful"); + + const math_t one = 1; + + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH, + side, + CUBLAS_FILL_MODE_UPPER, + CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, + n, + 1, + &one, + A, + lda, + b, + ldb, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream)); + } + + }; // namespace detail + }; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh new file mode 100644 index 0000000000..307ed30c57 --- /dev/null +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { + namespace linalg { + namespace detail { + +///@todo: support col-major +///@todo: specialize this to support shared-mem based atomics + + template + __global__ void reduce_cols_by_key_kernel( + const T *data, const KeyIteratorT keys, T *out, IdxType nrows, IdxType ncols, IdxType nkeys) { + typedef typename std::iterator_traits::value_type KeyType; + + IdxType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= (nrows * ncols)) return; + ///@todo: yikes! use fast-int-div + IdxType colId = idx % ncols; + IdxType rowId = idx / ncols; + KeyType key = keys[colId]; + raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]); + } + +/** + * @brief Computes the sum-reduction of matrix columns for each given key + * @tparam T the input data type (as well as the output reduced matrix) + * @tparam KeyType data type of the keys + * @tparam IdxType indexing arithmetic type + * @param data the input data (dim = nrows x ncols). This is assumed to be in + * row-major layout + * @param keys keys array (len = ncols). It is assumed that each key in this + * array is between [0, nkeys). In case this is not true, the caller is expected + * to have called make_monotonic primitive to prepare such a contiguous and + * monotonically increasing keys array. + * @param out the output reduced matrix along columns (dim = nrows x nkeys). 
+ * This will be assumed to be in row-major layout + * @param nrows number of rows in the input data + * @param ncols number of colums in the input data + * @param nkeys number of unique keys in the keys array + * @param stream cuda stream to launch the kernel onto + */ + template + void reduce_cols_by_key(const T *data, + const KeyIteratorT keys, + T *out, + IdxType nrows, + IdxType ncols, + IdxType nkeys, + cudaStream_t stream) { + typedef typename std::iterator_traits::value_type KeyType; + + RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream)); + constexpr int TPB = 256; + int nblks = (int) raft::ceildiv(nrows * ncols, TPB); + reduce_cols_by_key_kernel<<>>(data, keys, out, nrows, ncols, nkeys); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + }; // end namespace detail + }; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh new file mode 100644 index 0000000000..f4fa892472 --- /dev/null +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +#define MAX_BLOCKS 65535u +namespace raft { + namespace linalg { + namespace detail { + +// +// Small helper function to convert from int->char and char->int +// Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars +// + + template + void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) { + for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { + dst[idx] = src[idx]; + } + } + + template + void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) { + dim3 grid, block; + block.x = 256; + + grid.x = raft::ceildiv(n, (int) block.x); + grid.x = std::min(grid.x, MAX_BLOCKS); + + convert_array_kernel<<>>(dst, src, n); + } + + template + struct quad { + T x, y, z, w; + }; + +// +// Functor for reduce by key, small k +// + template + struct quadSum { + __host__ __device__ __forceinline__ quad operator()(const quad &a, const quad &b) const { + // wasting a double4.. 
+ quad c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + + return c; + } + }; + +// +// Reduce by keys +// We need to sum each dimension by labels +// The labels are not adjacent +// + +// +// Reduce by keys - for keys <= 4 +// + +#define SUM_ROWS_SMALL_K_DIMX 256 +#define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4 + template + __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) + + __global__ + void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + int lda, + const char *d_keys, + const WeightT *d_weights, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums) { + typedef typename std::iterator_traits::value_type DataType; + typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int idim = static_cast(blockIdx.y); idim < ncols; idim += gridDim.y) { + if (idim != static_cast(blockIdx.y)) __syncthreads(); // we're reusing temp_storage + + // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg + quad thread_sums; + thread_sums.x = 0.0; + thread_sums.y = 0.0; + thread_sums.z = 0.0; + thread_sums.w = 0.0; + + // May use vectorized load - not necessary for doubles + for (int block_offset_irow = blockIdx.x * blockDim.x; + block_offset_irow < nrows; // we will syncthreads() inside the loop, no CTA divergence + block_offset_irow += blockDim.x * gridDim.x) { + int irow = block_offset_irow + threadIdx.x; + DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0; + if (d_weights && irow < nrows) { val = val * d_weights[irow]; } + // we are not reusing the keys - after profiling + // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded + // (experimentation gave a 10% speed up - not worth the many code lines added) + int row_key = (irow < nrows) ? d_keys[irow] : -1; + + thread_sums.x += (row_key == 0) ? val : 0.0; + thread_sums.y += (row_key == 1) ? val : 0.0; + thread_sums.z += (row_key == 2) ? 
val : 0.0; + thread_sums.w += (row_key == 3) ? val : 0.0; + } + + // End of column + // Saving local sums back to global mem + + // Strided access + + // Reducing by key + thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum()); + + if (threadIdx.x < 32) { + // We only need 4 + thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff); + if (static_cast(threadIdx.x) < nkeys) { + if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x); + if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y); + if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z); + if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w); + } + } + } + } + + template + void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, + int lda, + const char *d_keys, + const WeightT *d_weights, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) { + dim3 grid, block; + block.x = SUM_ROWS_SMALL_K_DIMX; + block.y = 1; // Necessary + + grid.x = raft::ceildiv(nrows, (int) block.x); + grid.x = std::min(grid.x, 32u); + grid.y = ncols; + grid.y = std::min(grid.y, MAX_BLOCKS); + sum_rows_by_key_small_nkeys_kernel<<>>( + d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums); + } + +// +// Reduce by keys - large number of keys +// Computing a "weigthed histogram" with local histograms in smem +// Keeping it simple - not optimized +// + +#define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024 + + template + __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT *d_weights, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums) { + typedef typename std::iterator_traits::value_type KeyType; + typedef typename std::iterator_traits::value_type DataType; + __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K]; + + for 
(int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) + local_sums[local_key] = 0.0; + + for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) { + __syncthreads(); // local_sums + + // At this point local_sums if full of zeros + + for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows; + irow += blockDim.x * gridDim.x) { + // Branch div in this loop - not an issue with current code + DataType val = d_A[idim * lda + irow]; + if (d_weights) val = val * d_weights[irow]; + + int local_key = d_keys[irow] - key_offset; + + // We could load next val here + raft::myAtomicAdd(&local_sums[local_key], val); + } + + __syncthreads(); // local_sums + + for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) { + DataType local_sum = local_sums[local_key]; + + if (local_sum != 0.0) { + KeyType global_key = key_offset + local_key; + raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum); + local_sums[local_key] = 0.0; + } + } + } + } + + template + void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A, + int lda, + KeysIteratorT d_keys, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) { + dim3 grid, block; + block.x = SUM_ROWS_SMALL_K_DIMX; + block.y = 1; // Necessary + + grid.x = raft::ceildiv(nrows, (int) block.x); + grid.x = std::min(grid.x, 32u); + grid.y = ncols; + grid.y = std::min(grid.y, MAX_BLOCKS); + sum_rows_by_key_large_nkeys_kernel_colmajor<<>>( + d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums); + } + +#define RRBK_SHMEM_SZ 32 + +//#define RRBK_SHMEM + template + __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, + int lda, + const WeightT *d_weights, + KeysIteratorT d_keys, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums) { + typedef typename std::iterator_traits::value_type KeyType; + typedef typename std::iterator_traits::value_type DataType; + +#ifdef 
RRBK_SHMEM + __shared__ KeyType sh_keys[RRBK_SHMEM_SZ]; +#endif + int rows_per_partition = nrows / gridDim.z + 1; + int start_row = blockIdx.z * rows_per_partition; + int end_row = start_row + rows_per_partition; + end_row = end_row > nrows ? nrows : end_row; + + KeyType local_key = blockIdx.y; + if (local_key >= nkeys) return; + int this_col = threadIdx.x + blockIdx.x * blockDim.x; + if (this_col >= ncols) return; + + DataType sum = 0.0; + KeyType global_key = key_offset + local_key; +#ifdef RRBK_SHMEM + int sh_key_inx = 0; +#endif + for (int r = start_row; r < end_row; r++) { +#ifdef RRBK_SHMEM + if (0 == sh_key_inx % RRBK_SHMEM_SZ) { + for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x) + sh_keys[x] = d_keys[r + x]; + __syncthreads(); + } + if (sh_keys[sh_key_inx] != global_key) continue; // No divergence since global_key is the + // same for the whole block + sh_key_inx++; +#else + if (d_keys[r] != global_key) + continue; // No divergence since global_key is the + // same for the whole block +#endif + // if ((end_row-start_row) / (r-start_row) != global_key) continue; + DataType val = __ldcg(&d_A[r * lda + this_col]); + if (d_weights) { val = val * d_weights[r]; } + sum += val; + } + + if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum); + } + + template + void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT *d_weights, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) { + // x-dim refers to the column in the input data + // y-dim refers to the key + // z-dim refers to a partitioning of the rows among the threadblocks + dim3 grid, block; + block.x = 256; // Adjust me! + block.y = 1; // Don't adjust me! + grid.x = raft::ceildiv(ncols, (int) block.x); + grid.y = nkeys; + grid.z = std::max(40960000 / nkeys / ncols, (int) 1); // Adjust me! 
+ grid.z = std::min(grid.z, (unsigned int) nrows); + grid.z = std::min(grid.z, MAX_BLOCKS); + + sum_rows_by_key_large_nkeys_kernel_rowmajor<<>>( + d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums); + } + +/** + * @brief Computes the weighted reduction of matrix rows for each given key + * + * @tparam DataIteratorT Random-access iterator type, for reading input matrix + * (may be a simple pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys + * (may be a simple pointer type) + * + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param[in] d_weights Weights for each observation in d_A (1 x nrows) + * @param[out] d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ + template + void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT *d_weights, + char *d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) { + typedef typename std::iterator_traits::value_type KeyType; + typedef typename std::iterator_traits::value_type DataType; + + // Following kernel needs memset + cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream); + + if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) { + // sum_rows_by_key_small_k is BW bounded. 
d_keys is loaded ncols time - avoiding wasting BW + // with doubles we have ~20% speed up - with floats we can hope something around 2x + // Converting d_keys to char + convert_array(d_keys_char, d_keys, nrows, stream); + sum_rows_by_key_small_nkeys( + d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream); + } else { + for (KeyType key_offset = 0; key_offset < static_cast(nkeys); + key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) { + KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys); + sum_rows_by_key_large_nkeys_rowmajor( + d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream); + } + } + } + +/** + * @brief Computes the reduction of matrix rows for each given key + * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple + * pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple + * pointer type) + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ + template + void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + char *d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) { + typedef typename std::iterator_traits::value_type DataType; + reduce_rows_by_key(d_A, + lda, + d_keys, + static_cast(nullptr), + d_keys_char, + nrows, + ncols, + nkeys, + d_sums, + stream); + } + + }; // end namespace detail + }; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh 
b/cpp/include/raft/linalg/detail/rsvd.cuh new file mode 100644 index 0000000000..700ce43735 --- /dev/null +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + namespace linalg { + namespace detail { + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying no. of PCs and + * upsamples directly + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular values of input matrix + * @param V: right singular values of input matrix + * @param k: no. of singular values to be computed + * @param p: no. of upsamples + * @param use_bbt: whether use eigen decomposition in computation or not + * @param gen_left_vec: left vector needs to be generated or not? + * @param gen_right_vec: right vector needs to be generated or not? 
+ * @param use_jacobi: whether to jacobi solver for decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ + template + void rsvdFixedRank(const raft::handle_t &handle, + math_t *M, + int n_rows, + int n_cols, + math_t *S_vec, + math_t *U, + math_t *V, + int k, + int p, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + + // All the notations are following Algorithm 4 & 5 in S. Voronin's paper: + // https://arxiv.org/abs/1502.05366 + + int m = n_rows, n = n_cols; + int l = k + p; // Total number of singular values to be computed before truncation + int q = 2; // Number of power sampling counts + int s = 1; // Frequency controller for QR decomposition during power sampling + // scheme. 
s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s + // > 2: less frequent QR + + const math_t alpha = 1.0, beta = 0.0; + + // Build temporary U, S, V matrices + rmm::device_uvector S_vec_tmp(l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + + // build random matrix + rmm::device_uvector RN(n * l, stream); + raft::random::Rng rng(484); + rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream); + + // multiply to get matrix of random samples Y + rmm::device_uvector Y(m * l, stream); + raft::linalg::gemm( + handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + + // now build up (M M^T)^q R + rmm::device_uvector Z(n * l, stream); + rmm::device_uvector Yorth(m * l, stream); + rmm::device_uvector Zorth(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + + // power sampling scheme + for (int j = 1; j < q; j++) { + if ((2 * j - 2) % s == 0) { + raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream); + raft::linalg::gemm(handle, + M, + m, + n, + Yorth.data(), + Z.data(), + n, + l, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + stream); + } else { + raft::linalg::gemm( + handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + stream); + } + + if ((2 * j - 1) % s == 0) { + raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream); + raft::linalg::gemm(handle, + M, + m, + n, + Zorth.data(), + Y.data(), + m, + l, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + } else { + raft::linalg::gemm( + handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, + stream); + } + } + + // orthogonalize on exit from loop to get Q + rmm::device_uvector Q(m * l, stream); + 
RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); + raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream); + + // either QR of B^T method, or eigendecompose BB^T method + if (!use_bbt) { + // form Bt = Mt*Q : nxm * mxl = nxl + rmm::device_uvector Bt(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + raft::linalg::gemm( + handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + + // compute QR factorization of Bt + // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */ + rmm::device_uvector Qhat(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + rmm::device_uvector Rhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream); + + // compute SVD of Rhat (lxl) + rmm::device_uvector Uhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + rmm::device_uvector Vhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (use_jacobi) + raft::linalg::svdJacobi(handle, + Rhat.data(), + l, + l, + S_vec_tmp.data(), + Uhat.data(), + Vhat.data(), + true, + true, + tol, + max_sweeps, + stream); + else + raft::linalg::svdQR(handle, + Rhat.data(), + l, + l, + S_vec_tmp.data(), + Uhat.data(), + Vhat.data(), + true, + true, + true, + stream); + raft::matrix::sliceMatrix(S_vec_tmp.data(), + 1, + l, + S_vec, + 0, + 0, + 1, + k, + stream); // First k elements of S_vec + + // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk + if (gen_left_vec) { + raft::linalg::gemm(handle, + Q.data(), + m, + l, + Vhat.data(), + U, + m, + k /*used to be l and needs slicing*/, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + } + + // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * 
lxk = nxk + if (gen_right_vec) { + raft::linalg::gemm(handle, + Qhat.data(), + n, + l, + Uhat.data(), + V, + n, + k /*used to be l and needs slicing*/, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + } + } else { + // build the matrix B B^T = Q^T M M^T Q column by column + // B = Q^T M ; lxm * mxn = lxn + rmm::device_uvector B(n * l, stream); + raft::linalg::gemm( + handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + + rmm::device_uvector BBt(l * l, stream); + raft::linalg::gemm(handle, + B.data(), + l, + n, + B.data(), + BBt.data(), + l, + l, + CUBLAS_OP_N, + CUBLAS_OP_T, + alpha, + beta, + stream); + + // compute eigendecomposition of BBt + rmm::device_uvector Uhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + rmm::device_uvector Uhat_dup(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream); + if (use_jacobi) + raft::linalg::eigJacobi( + handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps); + else + raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream); + raft::matrix::seqRoot(S_vec_tmp.data(), l, stream); + raft::matrix::sliceMatrix(S_vec_tmp.data(), + 1, + l, + S_vec, + 0, + p, + 1, + l, + stream); // Last k elements of S_vec_tmp + raft::matrix::colReverse(S_vec, 1, k, stream); + + // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk + if (gen_left_vec) { + raft::linalg::gemm(handle, + Q.data(), + m, + l, + Uhat.data() + p * l, + U, + m, + k, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::colReverse(U, m, k, stream); + } + + // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] * + // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk + if (gen_right_vec) { + rmm::device_uvector Sinv(k * k, stream); +
RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + rmm::device_uvector UhatSinv(l * k, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); + raft::matrix::reciprocal(S_vec_tmp.data(), l, stream); + raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream); + + raft::linalg::gemm(handle, + Uhat.data() + p * l, + l, + k, + Sinv.data(), + UhatSinv.data(), + l, + k, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::linalg::gemm(handle, + B.data(), + l, + n, + UhatSinv.data(), + V, + n, + k, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::colReverse(V, n, k, stream); + } + } + } + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying the PC and upsampling + * ratio + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular values of input matrix + * @param V: right singular values of input matrix + * @param PC_perc: percentage of singular values to be computed + * @param UpS_perc: upsampling percentage + * @param use_bbt: whether use eigen decomposition in computation or not + * @param gen_left_vec: left vector needs to be generated or not? + * @param gen_right_vec: right vector needs to be generated or not? 
+ * @param use_jacobi: whether to jacobi solver for decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ + template + void rsvdPerc(const raft::handle_t &handle, + math_t *M, + int n_rows, + int n_cols, + math_t *S_vec, + math_t *U, + math_t *V, + math_t PC_perc, + math_t UpS_perc, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) { + int k = max((int) (min(n_rows, n_cols) * PC_perc), + 1); // Number of singular values to be computed + int p = max((int) (min(n_rows, n_cols) * UpS_perc), 1); // Upsamples + rsvdFixedRank(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, + k, + p, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); + } + + }; // end namespace detail + }; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh new file mode 100644 index 0000000000..935ffed190 --- /dev/null +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft { + namespace linalg { +namespace detail { + template + __global__ void ternaryOpKernel( + math_t *out, const math_t *in1, const math_t *in2, const math_t *in3, IdxType len, Lambda op) { + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType) blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); + } + + template + void ternaryOpImpl(math_t *out, + const math_t *in1, + const math_t *in2, + const math_t *in3, + IdxType len, + Lambda op, + cudaStream_t stream) { + const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType) TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ + template + void ternaryOp(math_t *out, + const math_t *in1, + const math_t *in2, + const math_t *in3, + IdxType len, + Lambda op, + cudaStream_t stream) { + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, 
len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } + } + +}; // end namespace detail + }; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp new file mode 100644 index 0000000000..bfa302eb4f --- /dev/null +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +namespace raft { +namespace linalg { + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. + * + * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, + * so it's not guaranteed to stay unmodified. 
+ */ +template +void lstsqSvdQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). + * + * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, + * so it's not guaranteed to stay unmodified. + */ +template +void lstsqSvdJacobi(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). + * (`w = (A^T A)^-1 A^T b`) + */ +template +void lstsqEig(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * via QR decomposition of `A = QR`. + * (triangular system of equations `Rw = Q^T b`) + * + * @param A[in/out] - input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param b[in/out] - input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. 
+ */ +template +void lstsqQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); +} + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh new file mode 100644 index 0000000000..1a39d4b3ba --- /dev/null +++ b/cpp/include/raft/linalg/power.cuh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft { + namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used for addressing + * @param out the output buffer + * @param in the input buffer (first argument of raft::myPow) + * @param scalar the scalar passed as the second argument of raft::myPow for every element + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) { + raft::linalg::unaryOp( + out, in, len,[scalar] __device__(math_t + in) { return raft::myPow(in, scalar); }, stream); +} +/** @} */ + +/** + * @defgroup BinaryOps Element-wise binary operations on the input buffers + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used for addressing + * @param out the output buffer + * @param in1 the first input buffer (first argument of raft::myPow) + * @param in2 the second input buffer (second argument of raft::myPow) + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + * @{ + */ +template +void power(math_t *out, const math_t *in1, const math_t *in2, IdxType len, cudaStream_t stream) { + raft::linalg::binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); +} +/** @} */ + + }; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh new file mode 100644 index 0000000000..c6e163d491 --- /dev/null +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { + + +/** + * @brief Computes the sum-reduction of matrix columns for each given key + * @tparam T the input data type (as well as the output reduced matrix) + * @tparam KeyIteratorT iterator type used to read the keys (may be a simple pointer type) + * @tparam IdxType indexing arithmetic type + * @param data the input data (dim = nrows x ncols). This is assumed to be in + * row-major layout + * @param keys keys array (len = ncols). It is assumed that each key in this + * array is between [0, nkeys). In case this is not true, the caller is expected + * to have called make_monotonic primitive to prepare such a contiguous and + * monotonically increasing keys array. + * @param out the output reduced matrix along columns (dim = nrows x nkeys).
+ * This will be assumed to be in row-major layout + * @param nrows number of rows in the input data + * @param ncols number of columns in the input data + * @param nkeys number of unique keys in the keys array + * @param stream cuda stream to launch the kernel onto + */ +template +void reduce_cols_by_key(const T *data, + const KeyIteratorT keys, + T *out, + IdxType nrows, + IdxType ncols, + IdxType nkeys, + cudaStream_t stream) { + detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); +} +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh new file mode 100644 index 0000000000..3b5345a540 --- /dev/null +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +/** + * @brief Computes the weighted reduction of matrix rows for each given key + * + * @tparam DataIteratorT Random-access iterator type, for reading input matrix + * (may be a simple pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys + * (may be a simple pointer type) + * + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param[in] d_weights Weights for each observation in d_A (1 x nrows) + * @param[out] d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT *d_weights, + char *d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) { + detail::reduce_rows_by_key(d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); +} + +/** + * @brief Computes the reduction of matrix rows for each given key + * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple + * pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple + * pointer type) + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] 
d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + char *d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) { + typedef typename std::iterator_traits::value_type DataType; + reduce_rows_by_key(d_A, + lda, + d_keys, + static_cast(nullptr), + d_keys_char, + nrows, + ncols, + nkeys, + d_sums, + stream); +} + + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh new file mode 100644 index 0000000000..e789abce30 --- /dev/null +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying no. of PCs and + * upsamples directly + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular vectors of input matrix + * @param V: right singular vectors of input matrix + * @param k: no.
of singular values to be computed + * @param p: no. of upsamples + * @param use_bbt: whether to use the eigendecomposition of B B^T (true) or the QR of B^T (false) + * @param gen_left_vec: whether the left singular vectors (U) are generated + * @param gen_right_vec: whether the right singular vectors (V) are generated + * @param use_jacobi: whether to use the Jacobi solver for the decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ +template +void rsvdFixedRank(const raft::handle_t &handle, + math_t *M, + int n_rows, + int n_cols, + math_t *S_vec, + math_t *U, + math_t *V, + int k, + int p, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) { + + detail::rsvdFixedRank(handle, M, n_rows, n_cols, S_vec, U, V, k, p, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream); +} + + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying the PC and upsampling + * ratio + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular vectors of input matrix + * @param V: right singular vectors of input matrix + * @param PC_perc: fraction of min(n_rows, n_cols) singular values to be computed (at least 1) + * @param UpS_perc: upsampling fraction of min(n_rows, n_cols) (at least 1 upsample) + * @param use_bbt: whether to use the eigendecomposition of B B^T (true) or the QR of B^T (false) + * @param gen_left_vec: whether the left singular vectors (U) are generated + * @param gen_right_vec: whether the right singular vectors (V) are generated
+ * @param use_jacobi: whether to use the Jacobi solver for the decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ +template +void rsvdPerc(const raft::handle_t &handle, + math_t *M, + int n_rows, + int n_cols, + math_t *S_vec, + math_t *U, + math_t *V, + math_t PC_perc, + math_t UpS_perc, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) { + detail::rsvdPerc(handle, M, n_rows, n_cols, S_vec, U, V, PC_perc, UpS_perc, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh new file mode 100644 index 0000000000..49eb6788ef --- /dev/null +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used for addressing + * @param out the output buffer (receives raft::mySqrt of each input element) + * @param in the input buffer + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh new file mode 100644 index 0000000000..99e21fd5a0 --- /dev/null +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda, invoked element-wise as op(in1[i], in2[i], in3[i]) + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(math_t *out, + const math_t *in1, + const math_t *in2, + const math_t *in3, + IdxType len, + Lambda op, + cudaStream_t stream) { + detail::ternaryOp(out, in1, in2, in3, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index fda60e1cb0..c0db20f650 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -56,10 +56,15 @@ add_executable(test_raft test/linalg/matrix_vector_op.cu test/linalg/multiply.cu test/linalg/norm.cu + test/linalg/power.cu test/linalg/reduce.cu + test/linalg/reduce_cols_by_key.cu + test/linalg/rsvd.cu + test/linalg/sqrt.cu test/linalg/strided_reduction.cu test/linalg/subtract.cu test/linalg/svd.cu + test/linalg/ternary_op.cu test/linalg/transpose.cu test/linalg/unary_op.cu test/matrix/math.cu diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu new file mode 100644 index 0000000000..8f336d583f --- /dev/null +++ b/cpp/test/linalg/power.cu @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include + +// The tests live in raft::linalg so that the unqualified power()/powerScalar() calls in SetUp resolve to the library primitives; both namespaces are closed at the end of this file. +namespace raft { +namespace linalg { + + // Reference kernel: each thread with idx < len writes out[idx] = myPow(in1[idx], in2[idx]). + template + __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); } + } + + // Host-side launcher for naivePowerElemKernel; reports launch errors through RAFT_CUDA_TRY. + template + void naivePowerElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) + { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naivePowerElemKernel<<>>(out, in1, in2, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + // Reference kernel: each thread with idx < len writes out[idx] = myPow(in1[idx], in2) for a scalar exponent in2. + template + __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); } + } + + // Host-side launcher for naivePowerScalarKernel; reports launch errors through RAFT_CUDA_TRY. 
+ template + void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) + { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naivePowerScalarKernel<<>>(out, in1, in2, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + // Test parameters: comparison tolerance, number of elements, RNG seed. + template + struct PowerInputs { + T tolerance; + int len; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const PowerInputs& dims) + { + return os; + } + + template + class PowerTest : public ::testing::TestWithParam> { + protected: + PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng
r(params.seed); + int len = params.len; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + in1.resize(len, stream); + in2.resize(len, stream); + out_ref.resize(len, stream); + out.resize(len, stream); + r.uniform(in1.data(), len, T(1.0), T(2.0), stream); + r.uniform(in2.data(), len, T(1.0), T(2.0), stream); + + naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream); + naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream); + + power(out.data(), in1.data(), in2.data(), len, stream); + powerScalar(out.data(), out.data(), T(2), len, stream); + power(in1.data(), in1.data(), in2.data(), len, stream); + powerScalar(in1.data(), in1.data(), T(2), len, stream); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + protected: + cudaStream_t stream = 0; + PowerInputs params; + rmm::device_uvector in1, in2, out_ref, out; + int device_count = 0; + }; + + const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; + + const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; + + typedef PowerTest PowerTestF; + TEST_P(PowerTestF, Result) + { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); + } + + typedef PowerTest PowerTestD; + TEST_P(PowerTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestD, ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu new file mode 100644 index 0000000000..55057b4894 --- /dev/null 
+++ b/cpp/test/linalg/reduce_cols_by_key.cu @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include + +namespace raft { + namespace linalg { + + template + void naiveReduceColsByKey(const T* in, + const uint32_t* keys, + T* out_ref, + uint32_t nrows, + uint32_t ncols, + uint32_t nkeys, + cudaStream_t stream) + { + std::vector h_keys(ncols, 0u); + raft::copy(&(h_keys[0]), keys, ncols, stream); + std::vector h_in(nrows * ncols); + raft::copy(&(h_in[0]), in, nrows * ncols, stream); + raft::interruptible::synchronize(stream); + std::vector out(nrows * nkeys, T(0)); + for (uint32_t i = 0; i < nrows; ++i) { + for (uint32_t j = 0; j < ncols; ++j) { + out[i * nkeys + h_keys[j]] += h_in[i * ncols + j]; + } + } + raft::copy(out_ref, &(out[0]), nrows * nkeys, stream); + raft::interruptible::synchronize(stream); + } + + template + struct ReduceColsInputs { + T tolerance; + uint32_t rows; + uint32_t cols; + uint32_t nkeys; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs& dims) + { + return os; + } + + template + class ReduceColsTest : public ::testing::TestWithParam> { + protected: + ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + 
raft::random::Rng r(params.seed); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + auto nrows = params.rows; + auto ncols = params.cols; + auto nkeys = params.nkeys; + in.resize(nrows * ncols, stream); + keys.resize(ncols, stream); + out_ref.resize(nrows * nkeys, stream); + out.resize(nrows * nkeys, stream); + r.uniform(in.data(), nrows * ncols, T(-1.0), T(1.0), stream); + r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream); + naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream); + reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream); + raft::interruptible::synchronize(stream); + } + + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + protected: + cudaStream_t stream = 0; + ReduceColsInputs params; + rmm::device_uvector in, out_ref, out; + rmm::device_uvector keys; + }; + + const std::vector> inputsf = {{0.0001f, 128, 32, 6, 1234ULL}, + {0.0005f, 121, 63, 10, 1234ULL}}; + typedef ReduceColsTest ReduceColsTestF; + TEST_P(ReduceColsTestF, Result) + { + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.rows * params.nkeys, + raft::CompareApprox(params.tolerance))); + } + INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf)); + + const std::vector> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL}, + {0.0000001, 121, 63, 10, 1234ULL}}; + typedef ReduceColsTest ReduceColsTestD; + TEST_P(ReduceColsTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.rows * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestD, ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu new file mode 100644 index 0000000000..e6dc8cef7f --- /dev/null +++ b/cpp/test/linalg/reduce_rows_by_key.cu @@ -0,0 +1,262 @@ +/* + * Copyright (c) 
2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include + +namespace raft { + namespace linalg { + + template + __global__ void naiveReduceRowsByKeyKernel(const Type* d_A, + int lda, + uint32_t* d_keys, + const Type* d_weight, + char* d_char_keys, + int nrows, + int ncols, + int nkeys, + Type* d_sums) + { + int c = threadIdx.x + blockIdx.x * blockDim.x; + if (c >= ncols) return; + int this_key = threadIdx.y + blockIdx.y * blockDim.y; + + Type sum = 0.0; + for (int r = 0; r < nrows; r++) { + if (this_key != d_keys[r]) continue; + Type wt = 1; + if (d_weight) wt = d_weight[r]; + sum += d_A[lda * r + c] * wt; + } + d_sums[this_key * ncols + c] = sum; + } + template + void naiveReduceRowsByKey(const Type* d_A, + int lda, + uint32_t* d_keys, + const Type* d_weight, + char* d_char_keys, + int nrows, + int ncols, + int nkeys, + Type* d_sums, + cudaStream_t stream) + { + cudaMemset(d_sums, 0, sizeof(Type) * nkeys * ncols); + + naiveReduceRowsByKeyKernel<<>>( + d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums); + } + + template + struct ReduceRowsInputs { + T tolerance; + int nobs; + uint32_t cols; + uint32_t nkeys; + unsigned long long int seed; + bool weighted; + T max_weight; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs& dims) + { + return os; + } + + template + class ReduceRowTest : public 
::testing::TestWithParam> { + public: + ReduceRowTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + in(params.nobs * params.cols, stream), + out(params.nkeys * params.cols, stream), + out_ref(params.nkeys * params.cols, stream), + keys(params.nobs, stream), + scratch_buf(params.nobs, stream) + { + } + + protected: + void SetUp() override + { + raft::random::Rng r(params.seed); + raft::random::Rng r_int(params.seed); + + int nobs = params.nobs; + uint32_t cols = params.cols; + uint32_t nkeys = params.nkeys; + r.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream); + r_int.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream); + + rmm::device_uvector weight(0, stream); + if (params.weighted) { + weight.resize(nobs, stream); + raft::random::Rng r(params.seed, raft::random::GeneratorType::GenPhilox); + r.uniform(weight.data(), nobs, T(1), params.max_weight, stream); + } + + naiveReduceRowsByKey(in.data(), + cols, + keys.data(), + params.weighted ? weight.data() : nullptr, + scratch_buf.data(), + nobs, + cols, + nkeys, + out_ref.data(), + stream); + if (params.weighted) { + reduce_rows_by_key(in.data(), + cols, + keys.data(), + params.weighted ? 
weight.data() : nullptr, + scratch_buf.data(), + nobs, + cols, + nkeys, + out.data(), + stream); + } else { + reduce_rows_by_key( + in.data(), cols, keys.data(), scratch_buf.data(), nobs, cols, nkeys, out.data(), stream); + } + handle.sync_stream(stream); + } + + protected: + ReduceRowsInputs params; + raft::handle_t handle; + cudaStream_t stream = 0; + + int device_count = 0; + rmm::device_uvector in, out, out_ref; + rmm::device_uvector keys; + rmm::device_uvector scratch_buf; + }; + +// ReduceRowTestF +// 128 Obs, 32 cols, 6 clusters + const std::vector> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false}, + {0.000001f, 128, 32, 6, 1234ULL, true, 1.0}, + {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}}; + typedef ReduceRowTest ReduceRowTestF; + TEST_P(ReduceRowTestF, Result) + { + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); + } + INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2)); + +// ReduceRowTestD +// 128 Obs, 32 cols, 6 clusters, double precision + const std::vector> inputsd2 = { + {0.00000001, 128, 32, 6, 1234ULL, false}, + {0.00000001, 128, 32, 6, 1234ULL, true, 2.0}, + {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}}; + typedef ReduceRowTest ReduceRowTestD; + TEST_P(ReduceRowTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestD, ::testing::ValuesIn(inputsd2)); + +// ReduceRowTestSmallnKey +// 128 Obs, 32 cols, 3 clusters +const std::vector> inputsf_small_nkey = { + {0.000001f, 128, 32, 3, 1234ULL, false}, + {0.000001f, 128, 32, 3, 1234ULL, true, 5.0}, + {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}}; +typedef ReduceRowTest ReduceRowTestSmallnKey; +TEST_P(ReduceRowTestSmallnKey, Result) +{ +ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + 
raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, + ReduceRowTestSmallnKey, + ::testing::ValuesIn(inputsf_small_nkey)); + +// ReduceRowTestBigSpace +// 512 Obs, 1024 cols, 32 clusters, double precision +const std::vector> inputsd_big_space = { + {0.00000001, 512, 1024, 40, 1234ULL, false}, + {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0}, + {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}}; +typedef ReduceRowTest ReduceRowTestBigSpace; +TEST_P(ReduceRowTestBigSpace, Result) +{ +ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, + ReduceRowTestBigSpace, + ::testing::ValuesIn(inputsd_big_space)); + +// ReduceRowTestManyObs +// 100000 Obs, 37 cols, 32 clusters +const std::vector> inputsf_many_obs = { + {0.00001f, 100000, 37, 32, 1234ULL, false}, + {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0}, + {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}}; +typedef ReduceRowTest ReduceRowTestManyObs; +TEST_P(ReduceRowTestManyObs, Result) +{ +ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, + ReduceRowTestManyObs, + ::testing::ValuesIn(inputsf_many_obs)); + +// ReduceRowTestManyClusters +// 100000 Obs, 37 cols, 2048 clusters +const std::vector> inputsf_many_cluster = { + {0.00001f, 100000, 37, 2048, 1234ULL, false}, + {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0}, + {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}}; +typedef ReduceRowTest ReduceRowTestManyClusters; +TEST_P(ReduceRowTestManyClusters, Result) +{ +ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, + ReduceRowTestManyClusters, + ::testing::ValuesIn(inputsf_many_cluster)); + +} // end namespace linalg +} 
// end namespace raft diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu new file mode 100644 index 0000000000..260ea07268 --- /dev/null +++ b/cpp/test/linalg/rsvd.cu @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + namespace linalg { + + template + struct RsvdInputs { + T tolerance; + int n_row; + int n_col; + T PC_perc; + T UpS_perc; + int k; + int p; + bool use_bbt; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const RsvdInputs& dims) + { + return os; + } + + template + class RsvdTest : public ::testing::TestWithParam> { + protected: + RsvdTest() + : A(0, stream), + U(0, stream), + S(0, stream), + V(0, stream), + left_eig_vectors_ref(0, stream), + right_eig_vectors_ref(0, stream), + sing_vals_ref(0, stream) + { + } + + void SetUp() override + { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + // rSVD seems to be very sensitive to the random number sequence as well! 
+ raft::random::Rng r(params.seed, raft::random::GenTaps); + int m = params.n_row, n = params.n_col; + T eig_svd_tol = 1.e-7; + int max_sweeps = 100; + + T mu = 0.0, sigma = 1.0; + A.resize(m * n, stream); + if (params.tolerance > 1) { // Sanity check + ASSERT(m == 3, "This test only supports mxn=3x2!"); + ASSERT(m * n == 6, "This test only supports mxn=3x2!"); + T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; + raft::update_device(A.data(), data_h, m * n, stream); + + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695}; + T right_eig_vectors_ref_h[] = {-0.638636, -0.769509}; + T sing_vals_ref_h[] = {7.065283}; + + left_eig_vectors_ref.resize(m, stream); + right_eig_vectors_ref.resize(n, stream); + sing_vals_ref.resize(1, stream); + + raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream); + raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream); + raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream); + + } else { // Other normal tests + r.normal(A.data(), m * n, mu, sigma, stream); + } + std::vector A_backup_cpu(m * + n); // Backup A matrix as svdJacobi will destroy the content of A + raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream); + + // A rank of zero selects the ratio-based API. Record that choice before k and p are + // overwritten below (they are needed to size U, S and V); testing params.k again + // afterwards would always be false and rsvdPerc would never be exercised. 
+ const bool use_ratio_api = (params.k == 0); + if (use_ratio_api) { + params.k = max((int)(min(m, n) * params.PC_perc), 1); + params.p = max((int)(min(m, n) * params.UpS_perc), 1); + } + + U.resize(m * params.k, stream); + S.resize(params.k, stream); + V.resize(n * params.k, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream)); + + // RSVD tests + if (use_ratio_api) { // Test with PC and upsampling ratio + rsvdPerc(handle, + A.data(), + m, + n, + S.data(), + U.data(), + V.data(), + params.PC_perc, + params.UpS_perc, + params.use_bbt, + true, + true, + false, + eig_svd_tol, + max_sweeps, +
stream); + } else { // Test with directly given fixed rank + rsvdFixedRank(handle, + A.data(), + m, + n, + S.data(), + U.data(), + V.data(), + params.k, + params.p, + params.use_bbt, + true, + true, + true, + eig_svd_tol, + max_sweeps, + stream); + } + raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream); + } + + protected: + cudaStream_t stream = 0; + RsvdInputs params; + rmm::device_uvector A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; + }; + + const std::vector> inputs_fx = { + // Test with ratios + {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT + }; + + const std::vector> inputs_dx = { + // Test with ratios + {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 256, 256, 0.2, 0.05, 0, 
0, false, 4321ULL}, // Square + non-BBT + {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT + }; + + const std::vector> sanity_inputs_fx = { + {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; + + const std::vector> sanity_inputs_dx = { + {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL}, + {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}}; + + typedef RsvdTest RsvdSanityCheckValF; + TEST_P(RsvdSanityCheckValF, Result) + { + ASSERT_TRUE(devArrMatch( + sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs(params.tolerance))); + } + + typedef RsvdTest RsvdSanityCheckValD; + TEST_P(RsvdSanityCheckValD, Result) +{ + ASSERT_TRUE(devArrMatch( + sing_vals_ref.data(), S.data(), params.k, 
raft::CompareApproxAbs(params.tolerance))); +} + +typedef RsvdTest RsvdSanityCheckLeftVecF; +TEST_P(RsvdSanityCheckLeftVecF, Result) +{ +ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), + U.data(), + params.n_row * params.k, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef RsvdTest RsvdSanityCheckLeftVecD; +TEST_P(RsvdSanityCheckLeftVecD, Result) +{ +ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), + U.data(), + params.n_row * params.k, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef RsvdTest RsvdSanityCheckRightVecF; +TEST_P(RsvdSanityCheckRightVecF, Result) +{ +ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), + V.data(), + params.n_col * params.k, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef RsvdTest RsvdSanityCheckRightVecD; +TEST_P(RsvdSanityCheckRightVecD, Result) +{ +ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), + V.data(), + params.n_col * params.k, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef RsvdTest RsvdTestSquareMatrixNormF; +TEST_P(RsvdTestSquareMatrixNormF, Result) +{ +raft::handle_t handle; + +ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, + A.data(), + U.data(), + S.data(), + V.data(), + params.n_row, + params.n_col, + params.k, + 4 * params.tolerance, + handle.get_stream())); +} + +typedef RsvdTest RsvdTestSquareMatrixNormD; +TEST_P(RsvdTestSquareMatrixNormD, Result) +{ +raft::handle_t handle; + +ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, + A.data(), + U.data(), + S.data(), + V.data(), + params.n_row, + params.n_col, + params.k, + 4 * params.tolerance, + handle.get_stream())); +} + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValF, ::testing::ValuesIn(sanity_inputs_fx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValD, ::testing::ValuesIn(sanity_inputs_dx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckLeftVecF, ::testing::ValuesIn(sanity_inputs_fx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckLeftVecD, 
::testing::ValuesIn(sanity_inputs_dx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckRightVecF, ::testing::ValuesIn(sanity_inputs_fx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckRightVecD, ::testing::ValuesIn(sanity_inputs_dx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormF, ::testing::ValuesIn(inputs_fx)); + +INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormD, ::testing::ValuesIn(inputs_dx)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu new file mode 100644 index 0000000000..bf64d264ad --- /dev/null +++ b/cpp/test/linalg/sqrt.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "test_utils.h" +#include +#include +#include +#include + +namespace raft { + namespace linalg { + + template + __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); } + } + + template + void naiveSqrtElem(Type* out, const Type* in1, int len) + { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveSqrtElemKernel<<>>(out, in1, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + template + struct SqrtInputs { + T tolerance; + int len; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const SqrtInputs& dims) + { + return os; + } + + template + class SqrtTest : public ::testing::TestWithParam> { + protected: + SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + int len = params.len; + in1.resize(len, stream); + out_ref.resize(len, stream); + out.resize(len, stream); + r.uniform(in1.data(), len, T(1.0), T(2.0), stream); + + naiveSqrtElem(out_ref.data(), in1.data(), len); + + sqrt(out.data(), in1.data(), len, stream); + sqrt(in1.data(), in1.data(), len, stream); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + protected: + cudaStream_t stream = 0; + SqrtInputs params; + rmm::device_uvector in1, out_ref, out; + int device_count = 0; + }; + + const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; + + const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; + + typedef SqrtTest SqrtTestF; + TEST_P(SqrtTestF, Result) + { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); + } + + 
typedef SqrtTest SqrtTestD; + TEST_P(SqrtTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestD, ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu new file mode 100644 index 0000000000..83ec3e6029 --- /dev/null +++ b/cpp/test/linalg/ternary_op.cu @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "test_utils.h" +#include +#include +#include +#include + +namespace raft { + namespace linalg { + + template + struct BinaryOpInputs { + InType tolerance; + IdxType len; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) + { + return os; + } + + template + class ternaryOpTest : public ::testing::TestWithParam> { + public: + ternaryOpTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + out_add_ref(params.len, stream), + out_add(params.len, stream), + out_mul_ref(params.len, stream), + out_mul(params.len, stream) + { + } + + void SetUp() override + { + raft::random::Rng rng(params.seed); + int len = params.len; + rmm::device_uvector in1(len, stream); + rmm::device_uvector in2(len, stream); + rmm::device_uvector in3(len, stream); + + rng.fill(out_add_ref.data(), len, T(6.0), stream); + rng.fill(out_mul_ref.data(), len, T(6.0), stream); + rng.fill(in1.data(), len, T(1.0), stream); + rng.fill(in2.data(), len, T(2.0), stream); + rng.fill(in3.data(), len, T(3.0), stream); + + auto add = [] __device__(T a, T b, T c) { return a + b + c; }; + auto mul = [] __device__(T a, T b, T c) { return a * b * c; }; + ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream); + ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream); + } + + protected: + BinaryOpInputs params; + raft::handle_t handle; + cudaStream_t stream = 0; + + rmm::device_uvector out_add_ref, out_add, out_mul_ref, out_mul; + }; + + const std::vector> inputsf = {{0.000001f, 1024 * 1024, 1234ULL}, + {0.000001f, 1024 * 1024 + 2, 1234ULL}, + {0.000001f, 1024 * 1024 + 1, 1234ULL}}; + typedef ternaryOpTest ternaryOpTestF; + TEST_P(ternaryOpTestF, Result) + { + ASSERT_TRUE(devArrMatch( + out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_mul_ref.data(), out_mul.data(), params.len, 
raft::CompareApprox(params.tolerance))); + } + INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf)); + + const std::vector> inputsd = {{0.00000001, 1024 * 1024, 1234ULL}, + {0.00000001, 1024 * 1024 + 2, 1234ULL}, + {0.00000001, 1024 * 1024 + 1, 1234ULL}}; + typedef ternaryOpTest ternaryOpTestD; + TEST_P(ternaryOpTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft From 6a0d70c48aba6ae36234b84c92dca3fc789dc47d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Feb 2022 18:53:04 -0500 Subject: [PATCH 02/24] Fixing style --- cpp/include/raft/linalg/detail/lstsq.hpp | 740 +++++++++--------- .../raft/linalg/detail/reduce_cols_by_key.cuh | 64 +- .../raft/linalg/detail/reduce_rows_by_key.cuh | 647 +++++++-------- cpp/include/raft/linalg/detail/rsvd.cuh | 634 +++++++-------- cpp/include/raft/linalg/detail/ternary_op.cuh | 121 +-- cpp/include/raft/linalg/lstsq.hpp | 8 +- cpp/include/raft/linalg/power.cuh | 23 +- .../raft/linalg/reduce_cols_by_key.cuh | 12 +- .../raft/linalg/reduce_rows_by_key.cuh | 44 +- cpp/include/raft/linalg/rsvd.cuh | 66 +- cpp/include/raft/linalg/sqrt.cuh | 4 +- cpp/include/raft/linalg/ternary_op.cuh | 15 +- cpp/test/linalg/power.cu | 204 ++--- cpp/test/linalg/reduce_cols_by_key.cu | 172 ++-- cpp/test/linalg/reduce_rows_by_key.cu | 358 ++++----- cpp/test/linalg/rsvd.cu | 474 +++++------ cpp/test/linalg/sqrt.cu | 166 ++-- cpp/test/linalg/ternary_op.cu | 138 ++-- 18 files changed, 1975 insertions(+), 1915 deletions(-) diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp index e8aeccc9b0..c91d6e41c1 100644 --- 
a/cpp/include/raft/linalg/detail/lstsq.hpp +++ b/cpp/include/raft/linalg/detail/lstsq.hpp @@ -36,44 +36,49 @@ #include namespace raft { - namespace linalg { - namespace detail { +namespace linalg { +namespace detail { - namespace { +namespace { /** Operate a CUDA event if we're in the concurrent mode; no-op otherwise. */ - struct DeviceEvent { - private: - cudaEvent_t e; - - public: - DeviceEvent(bool concurrent) { - if (concurrent) - RAFT_CUDA_TRY(cudaEventCreate(&e)); - else - e = nullptr; - } - - ~DeviceEvent() { - if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e)); - } - - operator cudaEvent_t() const { return e; } - - void record(cudaStream_t stream) { - if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream)); - } - - void wait(cudaStream_t stream) { - if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u)); - } - - void wait() { - if (e != nullptr) raft::interruptible::synchronize(e); - } - - DeviceEvent &operator=(const DeviceEvent &other) = delete; - }; +struct DeviceEvent { + private: + cudaEvent_t e; + + public: + DeviceEvent(bool concurrent) + { + if (concurrent) + RAFT_CUDA_TRY(cudaEventCreate(&e)); + else + e = nullptr; + } + + ~DeviceEvent() + { + if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e)); + } + + operator cudaEvent_t() const { return e; } + + void record(cudaStream_t stream) + { + if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream)); + } + + void wait(cudaStream_t stream) + { + if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u)); + } + + void wait() + { + if (e != nullptr) raft::interruptible::synchronize(e); + } + + DeviceEvent& operator=(const DeviceEvent& other) = delete; +}; /** * @brief Tells if the viewed CUDA stream is implicitly synchronized with the given stream. @@ -82,35 +87,36 @@ namespace raft { * if the two views point to the same stream * or sometimes when one of them is the legacy default stream. 
*/ - bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b) { - // any stream is "synchronized" with itself - if (a.value() == b.value()) return true; - // legacy + blocking streams - unsigned int flags = 0; - if (a.is_default()) { - RAFT_CUDA_TRY(cudaStreamGetFlags(b.value(), &flags)); - if ((flags & cudaStreamNonBlocking) == 0) return true; - } - if (b.is_default()) { - RAFT_CUDA_TRY(cudaStreamGetFlags(a.value(), &flags)); - if ((flags & cudaStreamNonBlocking) == 0) return true; - } - return false; - } - - template - struct DivideByNonZero { - constexpr static const math_t - eps = math_t(1e-10); - - __device__ math_t - - operator()(const math_t a, const math_t b) const { - return raft::myAbs(b) >= eps ? a / b : a; - } - }; - - } // namespace +bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b) +{ + // any stream is "synchronized" with itself + if (a.value() == b.value()) return true; + // legacy + blocking streams + unsigned int flags = 0; + if (a.is_default()) { + RAFT_CUDA_TRY(cudaStreamGetFlags(b.value(), &flags)); + if ((flags & cudaStreamNonBlocking) == 0) return true; + } + if (b.is_default()) { + RAFT_CUDA_TRY(cudaStreamGetFlags(a.value(), &flags)); + if ((flags & cudaStreamNonBlocking) == 0) return true; + } + return false; +} + +template +struct DivideByNonZero { + constexpr static const math_t eps = math_t(1e-10); + + __device__ math_t + + operator()(const math_t a, const math_t b) const + { + return raft::myAbs(b) >= eps ? a / b : a; + } +}; + +} // namespace /** Solves the linear ordinary least squares problem `Aw = b` * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. @@ -118,58 +124,59 @@ namespace raft { * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, * so it's not guaranteed to stay unmodified. 
*/ - template - void lstsqSvdQR(const raft::handle_t &handle, - math_t *A, - const int n_rows, - const int n_cols, - const math_t *b, - math_t *w, - cudaStream_t stream) { - const int minmn = min(n_rows, n_cols); - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - int cusolverWorkSetSize = 0; - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize( - cusolverH, n_rows, n_cols, &cusolverWorkSetSize)); - - rmm::device_uvector workset(cusolverWorkSetSize // cuSolver - + n_rows * minmn // U - + n_cols * n_cols // V - + minmn // S - + minmn // U^T * b - + 1 // devInfo - , - stream); - math_t *cusolverWorkSet = workset.data(); - math_t *U = cusolverWorkSet + cusolverWorkSetSize; - math_t *Vt = U + n_rows * minmn; - math_t *S = Vt + n_cols * n_cols; - math_t *Ub = S + minmn; - int *devInfo = reinterpret_cast(Ub + minmn); - - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd(cusolverH, - 'S', - 'S', - n_rows, - n_cols, - A, - n_rows, - S, - U, - n_rows, - Vt, - n_cols, - cusolverWorkSet, - cusolverWorkSetSize, - nullptr, - devInfo, - stream)); - raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); - raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); - raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream); - } +template +void lstsqSvdQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + const int minmn = min(n_rows, n_cols); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + int cusolverWorkSetSize = 0; + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize( + cusolverH, n_rows, n_cols, &cusolverWorkSetSize)); + + rmm::device_uvector workset(cusolverWorkSetSize // cuSolver + + n_rows * minmn // U + + n_cols * n_cols // V + + minmn // S + 
+ minmn // U^T * b + + 1 // devInfo + , + stream); + math_t* cusolverWorkSet = workset.data(); + math_t* U = cusolverWorkSet + cusolverWorkSetSize; + math_t* Vt = U + n_rows * minmn; + math_t* S = Vt + n_cols * n_cols; + math_t* Ub = S + minmn; + int* devInfo = reinterpret_cast(Ub + minmn); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd(cusolverH, + 'S', + 'S', + n_rows, + n_cols, + A, + n_rows, + S, + U, + n_rows, + Vt, + n_cols, + cusolverWorkSet, + cusolverWorkSetSize, + nullptr, + devInfo, + stream)); + raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); + raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); + raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream); +} /** Solves the linear ordinary least squares problem `Aw = b` * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). @@ -177,159 +184,161 @@ namespace raft { * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, * so it's not guaranteed to stay unmodified. 
*/ - template - void lstsqSvdJacobi(const raft::handle_t &handle, - math_t *A, - const int n_rows, - const int n_cols, - const math_t *b, - math_t *w, - cudaStream_t stream) { - const int minmn = min(n_rows, n_cols); - gesvdjInfo_t gesvdj_params; - RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params)); - int cusolverWorkSetSize = 0; - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY( - raft::linalg::detail::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - 1, - n_rows, - n_cols, - A, - n_rows, - nullptr, - nullptr, - n_rows, - nullptr, - n_cols, - &cusolverWorkSetSize, - gesvdj_params)); - rmm::device_uvector workset(cusolverWorkSetSize // cuSolver - + n_rows * minmn // U - + n_cols * minmn // V - + minmn // S - + minmn // U^T * b - + 1 // devInfo - , - stream); - math_t *cusolverWorkSet = workset.data(); - math_t *U = cusolverWorkSet + cusolverWorkSetSize; - math_t *V = U + n_rows * minmn; - math_t *S = V + n_cols * minmn; - math_t *Ub = S + minmn; - int *devInfo = reinterpret_cast(Ub + minmn); - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - 1, - n_rows, - n_cols, - A, - n_rows, - S, - U, - n_rows, - V, - n_cols, - cusolverWorkSet, - cusolverWorkSetSize, - devInfo, - gesvdj_params, - stream)); - raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); - raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); - raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream); - } +template +void lstsqSvdJacobi(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + const int minmn = min(n_rows, n_cols); + gesvdjInfo_t gesvdj_params; + RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + int cusolverWorkSetSize = 0; + cusolverDnHandle_t cusolverH 
= handle.get_cusolver_dn_handle(); + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY( + raft::linalg::detail::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + 1, + n_rows, + n_cols, + A, + n_rows, + nullptr, + nullptr, + n_rows, + nullptr, + n_cols, + &cusolverWorkSetSize, + gesvdj_params)); + rmm::device_uvector workset(cusolverWorkSetSize // cuSolver + + n_rows * minmn // U + + n_cols * minmn // V + + minmn // S + + minmn // U^T * b + + 1 // devInfo + , + stream); + math_t* cusolverWorkSet = workset.data(); + math_t* U = cusolverWorkSet + cusolverWorkSetSize; + math_t* V = U + n_rows * minmn; + math_t* S = V + n_cols * minmn; + math_t* Ub = S + minmn; + int* devInfo = reinterpret_cast(Ub + minmn); + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + 1, + n_rows, + n_cols, + A, + n_rows, + S, + U, + n_rows, + V, + n_cols, + cusolverWorkSet, + cusolverWorkSetSize, + devInfo, + gesvdj_params, + stream)); + raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream); + raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero(), stream); + raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream); +} /** Solves the linear ordinary least squares problem `Aw = b` * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). 
* (`w = (A^T A)^-1 A^T b`) */ - template - void lstsqEig(const raft::handle_t &handle, - const math_t *A, - const int n_rows, - const int n_cols, - const math_t *b, - math_t *w, - cudaStream_t stream) { - rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); - rmm::cuda_stream_view multAbStream = mainStream; - bool concurrent = false; - { - int sp_size = handle.get_stream_pool_size(); - if (sp_size > 0) { - multAbStream = handle.get_stream_from_stream_pool(0); - // check if the two streams can run concurrently - if (!are_implicitly_synchronized(mainStream, multAbStream)) { - concurrent = true; - } else if (sp_size > 1) { - mainStream = multAbStream; - multAbStream = handle.get_stream_from_stream_pool(1); - concurrent = true; - } - } - } - // the event is created only if the given raft handle is capable of running - // at least two CUDA streams without implicit synchronization. - DeviceEvent multAbDone(concurrent); - - rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); - math_t *Q = workset.data(); - math_t *QS = Q + n_cols * n_cols; - math_t *covA = QS + n_cols * n_cols; - math_t *S = covA + n_cols * n_cols; - math_t *Ab = S + n_cols; - - // covA <- A* A - math_t alpha = math_t(1); - math_t beta = math_t(0); - raft::linalg::gemm(handle, - A, - n_rows, - n_cols, - A, - covA, - n_cols, - n_cols, - CUBLAS_OP_T, - CUBLAS_OP_N, - alpha, - beta, - mainStream); - - // Ab <- A* b - raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream); - multAbDone.record(multAbStream); - - // Q S Q* <- covA - raft::common::nvtx::push_range("raft::linalg::eigDC"); - raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream); - raft::common::nvtx::pop_range(); - - // QS <- Q invS - raft::linalg::matrixVectorOp( - QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero(), mainStream); - // covA <- QS Q* == Q invS Q* == inv(A* A) - raft::linalg::gemm(handle, - QS, - n_cols, - n_cols, - Q, - covA, - n_cols, - n_cols, - CUBLAS_OP_N, 
- CUBLAS_OP_T, - alpha, - beta, - mainStream); - multAbDone.wait(mainStream); - // w <- covA Ab == Q invS Q* A b == inv(A* A) A b - raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream); - } +template +void lstsqEig(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); + rmm::cuda_stream_view multAbStream = mainStream; + bool concurrent = false; + { + int sp_size = handle.get_stream_pool_size(); + if (sp_size > 0) { + multAbStream = handle.get_stream_from_stream_pool(0); + // check if the two streams can run concurrently + if (!are_implicitly_synchronized(mainStream, multAbStream)) { + concurrent = true; + } else if (sp_size > 1) { + mainStream = multAbStream; + multAbStream = handle.get_stream_from_stream_pool(1); + concurrent = true; + } + } + } + // the event is created only if the given raft handle is capable of running + // at least two CUDA streams without implicit synchronization. 
+ DeviceEvent multAbDone(concurrent); + + rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); + math_t* Q = workset.data(); + math_t* QS = Q + n_cols * n_cols; + math_t* covA = QS + n_cols * n_cols; + math_t* S = covA + n_cols * n_cols; + math_t* Ab = S + n_cols; + + // covA <- A* A + math_t alpha = math_t(1); + math_t beta = math_t(0); + raft::linalg::gemm(handle, + A, + n_rows, + n_cols, + A, + covA, + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + mainStream); + + // Ab <- A* b + raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream); + multAbDone.record(multAbStream); + + // Q S Q* <- covA + raft::common::nvtx::push_range("raft::linalg::eigDC"); + raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream); + raft::common::nvtx::pop_range(); + + // QS <- Q invS + raft::linalg::matrixVectorOp( + QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero(), mainStream); + // covA <- QS Q* == Q invS Q* == inv(A* A) + raft::linalg::gemm(handle, + QS, + n_cols, + n_cols, + Q, + covA, + n_cols, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_T, + alpha, + beta, + mainStream); + multAbDone.wait(mainStream); + // w <- covA Ab == Q invS Q* A b == inv(A* A) A b + raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream); +} /** Solves the linear ordinary least squares problem `Aw = b` * via QR decomposition of `A = QR`. @@ -340,105 +349,106 @@ namespace raft { * @param b[in/out] - input target vector. * Warning: the content of this vector is modified by the cuSOLVER routines. 
*/ - template - void lstsqQR(const raft::handle_t &handle, - math_t *A, - const int n_rows, - const int n_cols, - math_t *b, - math_t *w, - cudaStream_t stream) { - cublasHandle_t cublasH = handle.get_cublas_handle(); - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int m = n_rows; - int n = n_cols; - - int info = 0; - rmm::device_uvector d_tau(n, stream); - rmm::device_scalar d_info(stream); - - const cublasSideMode_t side = CUBLAS_SIDE_LEFT; - const cublasOperation_t trans = CUBLAS_OP_T; - - int lwork_geqrf = 0; - int lwork_ormqr = 0; - int lwork = 0; - - const int lda = m; - const int ldb = m; - - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY( - raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf)); - - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH, - side, - trans, - m, - 1, - n, - A, - lda, - d_tau.data(), - b, // C, - lda, // ldc, - &lwork_ormqr)); - - lwork = (lwork_geqrf > lwork_ormqr) ? 
lwork_geqrf : lwork_ormqr; - - rmm::device_uvector d_work(lwork, stream); - - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf( - cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ASSERT(0 == info, "lstsq.h: QR wasn't successful"); - - // #TODO: Call from public API when ready - RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH, - side, - trans, - m, - 1, - n, - A, - lda, - d_tau.data(), - b, - ldb, - d_work.data(), - lwork, - d_info.data(), - stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ASSERT(0 == info, "lstsq.h: QR wasn't successful"); - - const math_t one = 1; - - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH, - side, - CUBLAS_FILL_MODE_UPPER, - CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, - n, - 1, - &one, - A, - lda, - b, - ldb, - stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream)); - } - - }; // namespace detail - }; // namespace linalg +template +void lstsqQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + math_t* b, + math_t* w, + cudaStream_t stream) +{ + cublasHandle_t cublasH = handle.get_cublas_handle(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows; + int n = n_cols; + + int info = 0; + rmm::device_uvector d_tau(n, stream); + rmm::device_scalar d_info(stream); + + const cublasSideMode_t side = CUBLAS_SIDE_LEFT; + const cublasOperation_t trans = CUBLAS_OP_T; + + int lwork_geqrf = 0; + int lwork_ormqr = 0; + int lwork = 0; + + const int lda = m; + const int ldb = m; + + // #TODO: Call from public API when ready + 
RAFT_CUSOLVER_TRY( + raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf)); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH, + side, + trans, + m, + 1, + n, + A, + lda, + d_tau.data(), + b, // C, + lda, // ldc, + &lwork_ormqr)); + + lwork = (lwork_geqrf > lwork_ormqr) ? lwork_geqrf : lwork_ormqr; + + rmm::device_uvector d_work(lwork, stream); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf( + cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + ASSERT(0 == info, "lstsq.h: QR wasn't successful"); + + // #TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH, + side, + trans, + m, + 1, + n, + A, + lda, + d_tau.data(), + b, + ldb, + d_work.data(), + lwork, + d_info.data(), + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + ASSERT(0 == info, "lstsq.h: QR wasn't successful"); + + const math_t one = 1; + + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH, + side, + CUBLAS_FILL_MODE_UPPER, + CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, + n, + 1, + &one, + A, + lda, + b, + ldb, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream)); +} + +}; // namespace detail +}; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index 307ed30c57..54cf9aa204 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ 
b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -22,25 +22,26 @@ #include namespace raft { - namespace linalg { - namespace detail { +namespace linalg { +namespace detail { ///@todo: support col-major ///@todo: specialize this to support shared-mem based atomics - template - __global__ void reduce_cols_by_key_kernel( - const T *data, const KeyIteratorT keys, T *out, IdxType nrows, IdxType ncols, IdxType nkeys) { - typedef typename std::iterator_traits::value_type KeyType; +template +__global__ void reduce_cols_by_key_kernel( + const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) +{ + typedef typename std::iterator_traits::value_type KeyType; - IdxType idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= (nrows * ncols)) return; - ///@todo: yikes! use fast-int-div - IdxType colId = idx % ncols; - IdxType rowId = idx / ncols; - KeyType key = keys[colId]; - raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]); - } + IdxType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= (nrows * ncols)) return; + ///@todo: yikes! 
use fast-int-div + IdxType colId = idx % ncols; + IdxType rowId = idx / ncols; + KeyType key = keys[colId]; + raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]); +} /** * @brief Computes the sum-reduction of matrix columns for each given key @@ -60,22 +61,23 @@ namespace raft { * @param nkeys number of unique keys in the keys array * @param stream cuda stream to launch the kernel onto */ - template - void reduce_cols_by_key(const T *data, - const KeyIteratorT keys, - T *out, - IdxType nrows, - IdxType ncols, - IdxType nkeys, - cudaStream_t stream) { - typedef typename std::iterator_traits::value_type KeyType; +template +void reduce_cols_by_key(const T* data, + const KeyIteratorT keys, + T* out, + IdxType nrows, + IdxType ncols, + IdxType nkeys, + cudaStream_t stream) +{ + typedef typename std::iterator_traits::value_type KeyType; - RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream)); - constexpr int TPB = 256; - int nblks = (int) raft::ceildiv(nrows * ncols, TPB); - reduce_cols_by_key_kernel<<>>(data, keys, out, nrows, ncols, nkeys); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - }; // end namespace detail - }; // end namespace linalg + RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream)); + constexpr int TPB = 256; + int nblks = (int)raft::ceildiv(nrows * ncols, TPB); + reduce_cols_by_key_kernel<<>>(data, keys, out, nrows, ncols, nkeys); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +}; // end namespace detail +}; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index f4fa892472..c88895807d 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -24,53 +24,56 @@ #define MAX_BLOCKS 65535u namespace raft { - namespace linalg { - namespace detail { +namespace linalg { +namespace detail { // // Small helper function to convert from 
int->char and char->int // Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars // - template - void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) { - for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { - dst[idx] = src[idx]; - } - } +template +void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) +{ + for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { + dst[idx] = src[idx]; + } +} - template - void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) { - dim3 grid, block; - block.x = 256; +template +void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) +{ + dim3 grid, block; + block.x = 256; - grid.x = raft::ceildiv(n, (int) block.x); - grid.x = std::min(grid.x, MAX_BLOCKS); + grid.x = raft::ceildiv(n, (int)block.x); + grid.x = std::min(grid.x, MAX_BLOCKS); - convert_array_kernel<<>>(dst, src, n); - } + convert_array_kernel<<>>(dst, src, n); +} - template - struct quad { - T x, y, z, w; - }; +template +struct quad { + T x, y, z, w; +}; // // Functor for reduce by key, small k // - template - struct quadSum { - __host__ __device__ __forceinline__ quad operator()(const quad &a, const quad &b) const { - // wasting a double4.. - quad c; - c.x = a.x + b.x; - c.y = a.y + b.y; - c.z = a.z + b.z; - c.w = a.w + b.w; - - return c; - } - }; +template +struct quadSum { + __host__ __device__ __forceinline__ quad operator()(const quad& a, const quad& b) const + { + // wasting a double4.. 
+ quad c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + + return c; + } +}; // // Reduce by keys @@ -84,92 +87,93 @@ namespace raft { #define SUM_ROWS_SMALL_K_DIMX 256 #define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4 - template - __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) - - __global__ - void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, - int lda, - const char *d_keys, - const WeightT *d_weights, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums) { - typedef typename std::iterator_traits::value_type DataType; - typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idim = static_cast(blockIdx.y); idim < ncols; idim += gridDim.y) { - if (idim != static_cast(blockIdx.y)) __syncthreads(); // we're reusing temp_storage - - // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg - quad thread_sums; - thread_sums.x = 0.0; - thread_sums.y = 0.0; - thread_sums.z = 0.0; - thread_sums.w = 0.0; - - // May use vectorized load - not necessary for doubles - for (int block_offset_irow = blockIdx.x * blockDim.x; - block_offset_irow < nrows; // we will syncthreads() inside the loop, no CTA divergence - block_offset_irow += blockDim.x * gridDim.x) { - int irow = block_offset_irow + threadIdx.x; - DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0; - if (d_weights && irow < nrows) { val = val * d_weights[irow]; } - // we are not reusing the keys - after profiling - // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded - // (experimentation gave a 10% speed up - not worth the many code lines added) - int row_key = (irow < nrows) ? d_keys[irow] : -1; - - thread_sums.x += (row_key == 0) ? val : 0.0; - thread_sums.y += (row_key == 1) ? val : 0.0; - thread_sums.z += (row_key == 2) ? val : 0.0; - thread_sums.w += (row_key == 3) ? 
val : 0.0; - } - - // End of column - // Saving local sums back to global mem - - // Strided access - - // Reducing by key - thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum()); - - if (threadIdx.x < 32) { - // We only need 4 - thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff); - if (static_cast(threadIdx.x) < nkeys) { - if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x); - if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y); - if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z); - if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w); - } - } - } - } - - template - void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, - int lda, - const char *d_keys, - const WeightT *d_weights, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t st) { - dim3 grid, block; - block.x = SUM_ROWS_SMALL_K_DIMX; - block.y = 1; // Necessary - - grid.x = raft::ceildiv(nrows, (int) block.x); - grid.x = std::min(grid.x, 32u); - grid.y = ncols; - grid.y = std::min(grid.y, MAX_BLOCKS); - sum_rows_by_key_small_nkeys_kernel<<>>( - d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums); - } +template +__launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) + + __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + int lda, + const char* d_keys, + const WeightT* d_weights, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums) +{ + typedef typename std::iterator_traits::value_type DataType; + typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int idim = static_cast(blockIdx.y); idim < ncols; idim += gridDim.y) { + if (idim != static_cast(blockIdx.y)) __syncthreads(); // we're reusing temp_storage + + // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this 
reg + quad thread_sums; + thread_sums.x = 0.0; + thread_sums.y = 0.0; + thread_sums.z = 0.0; + thread_sums.w = 0.0; + + // May use vectorized load - not necessary for doubles + for (int block_offset_irow = blockIdx.x * blockDim.x; + block_offset_irow < nrows; // we will syncthreads() inside the loop, no CTA divergence + block_offset_irow += blockDim.x * gridDim.x) { + int irow = block_offset_irow + threadIdx.x; + DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0; + if (d_weights && irow < nrows) { val = val * d_weights[irow]; } + // we are not reusing the keys - after profiling + // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded + // (experimentation gave a 10% speed up - not worth the many code lines added) + int row_key = (irow < nrows) ? d_keys[irow] : -1; + + thread_sums.x += (row_key == 0) ? val : 0.0; + thread_sums.y += (row_key == 1) ? val : 0.0; + thread_sums.z += (row_key == 2) ? val : 0.0; + thread_sums.w += (row_key == 3) ? val : 0.0; + } + + // End of column + // Saving local sums back to global mem + + // Strided access + + // Reducing by key + thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum()); + + if (threadIdx.x < 32) { + // We only need 4 + thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff); + if (static_cast(threadIdx.x) < nkeys) { + if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x); + if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y); + if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z); + if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w); + } + } + } +} + +template +void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, + int lda, + const char* d_keys, + const WeightT* d_weights, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) +{ + dim3 grid, block; + block.x = 
SUM_ROWS_SMALL_K_DIMX; + block.y = 1; // Necessary + + grid.x = raft::ceildiv(nrows, (int)block.x); + grid.x = std::min(grid.x, 32u); + grid.y = ncols; + grid.y = std::min(grid.y, MAX_BLOCKS); + sum_rows_by_key_small_nkeys_kernel<<>>( + d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums); +} // // Reduce by keys - large number of keys @@ -179,160 +183,163 @@ namespace raft { #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024 - template - __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - const WeightT *d_weights, - int nrows, - int ncols, - int key_offset, - int nkeys, - DataIteratorT d_sums) { - typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; - __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K]; - - for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) - local_sums[local_key] = 0.0; - - for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) { - __syncthreads(); // local_sums - - // At this point local_sums if full of zeros - - for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows; - irow += blockDim.x * gridDim.x) { - // Branch div in this loop - not an issue with current code - DataType val = d_A[idim * lda + irow]; - if (d_weights) val = val * d_weights[irow]; - - int local_key = d_keys[irow] - key_offset; - - // We could load next val here - raft::myAtomicAdd(&local_sums[local_key], val); - } - - __syncthreads(); // local_sums - - for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) { - DataType local_sum = local_sums[local_key]; - - if (local_sum != 0.0) { - KeyType global_key = key_offset + local_key; - raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum); - local_sums[local_key] = 0.0; - } - } - } - } - - template - void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A, - int lda, - KeysIteratorT d_keys, - int 
nrows, - int ncols, - int key_offset, - int nkeys, - DataIteratorT d_sums, - cudaStream_t st) { - dim3 grid, block; - block.x = SUM_ROWS_SMALL_K_DIMX; - block.y = 1; // Necessary - - grid.x = raft::ceildiv(nrows, (int) block.x); - grid.x = std::min(grid.x, 32u); - grid.y = ncols; - grid.y = std::min(grid.y, MAX_BLOCKS); - sum_rows_by_key_large_nkeys_kernel_colmajor<<>>( - d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums); - } +template +__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT* d_weights, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums) +{ + typedef typename std::iterator_traits::value_type KeyType; + typedef typename std::iterator_traits::value_type DataType; + __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K]; + + for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) + local_sums[local_key] = 0.0; + + for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) { + __syncthreads(); // local_sums + + // At this point local_sums if full of zeros + + for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows; + irow += blockDim.x * gridDim.x) { + // Branch div in this loop - not an issue with current code + DataType val = d_A[idim * lda + irow]; + if (d_weights) val = val * d_weights[irow]; + + int local_key = d_keys[irow] - key_offset; + + // We could load next val here + raft::myAtomicAdd(&local_sums[local_key], val); + } + + __syncthreads(); // local_sums + + for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) { + DataType local_sum = local_sums[local_key]; + + if (local_sum != 0.0) { + KeyType global_key = key_offset + local_key; + raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum); + local_sums[local_key] = 0.0; + } + } + } +} + +template +void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A, + int lda, + KeysIteratorT 
d_keys, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) +{ + dim3 grid, block; + block.x = SUM_ROWS_SMALL_K_DIMX; + block.y = 1; // Necessary + + grid.x = raft::ceildiv(nrows, (int)block.x); + grid.x = std::min(grid.x, 32u); + grid.y = ncols; + grid.y = std::min(grid.y, MAX_BLOCKS); + sum_rows_by_key_large_nkeys_kernel_colmajor<<>>( + d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums); +} #define RRBK_SHMEM_SZ 32 //#define RRBK_SHMEM - template - __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, - int lda, - const WeightT *d_weights, - KeysIteratorT d_keys, - int nrows, - int ncols, - int key_offset, - int nkeys, - DataIteratorT d_sums) { - typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; +template +__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, + int lda, + const WeightT* d_weights, + KeysIteratorT d_keys, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums) +{ + typedef typename std::iterator_traits::value_type KeyType; + typedef typename std::iterator_traits::value_type DataType; #ifdef RRBK_SHMEM - __shared__ KeyType sh_keys[RRBK_SHMEM_SZ]; + __shared__ KeyType sh_keys[RRBK_SHMEM_SZ]; #endif - int rows_per_partition = nrows / gridDim.z + 1; - int start_row = blockIdx.z * rows_per_partition; - int end_row = start_row + rows_per_partition; - end_row = end_row > nrows ? nrows : end_row; - - KeyType local_key = blockIdx.y; - if (local_key >= nkeys) return; - int this_col = threadIdx.x + blockIdx.x * blockDim.x; - if (this_col >= ncols) return; - - DataType sum = 0.0; - KeyType global_key = key_offset + local_key; + int rows_per_partition = nrows / gridDim.z + 1; + int start_row = blockIdx.z * rows_per_partition; + int end_row = start_row + rows_per_partition; + end_row = end_row > nrows ? 
nrows : end_row; + + KeyType local_key = blockIdx.y; + if (local_key >= nkeys) return; + int this_col = threadIdx.x + blockIdx.x * blockDim.x; + if (this_col >= ncols) return; + + DataType sum = 0.0; + KeyType global_key = key_offset + local_key; #ifdef RRBK_SHMEM - int sh_key_inx = 0; + int sh_key_inx = 0; #endif - for (int r = start_row; r < end_row; r++) { + for (int r = start_row; r < end_row; r++) { #ifdef RRBK_SHMEM - if (0 == sh_key_inx % RRBK_SHMEM_SZ) { - for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x) - sh_keys[x] = d_keys[r + x]; - __syncthreads(); - } - if (sh_keys[sh_key_inx] != global_key) continue; // No divergence since global_key is the - // same for the whole block - sh_key_inx++; + if (0 == sh_key_inx % RRBK_SHMEM_SZ) { + for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x) + sh_keys[x] = d_keys[r + x]; + __syncthreads(); + } + if (sh_keys[sh_key_inx] != global_key) continue; // No divergence since global_key is the + // same for the whole block + sh_key_inx++; #else - if (d_keys[r] != global_key) - continue; // No divergence since global_key is the - // same for the whole block + if (d_keys[r] != global_key) continue; // No divergence since global_key is the + // same for the whole block #endif - // if ((end_row-start_row) / (r-start_row) != global_key) continue; - DataType val = __ldcg(&d_A[r * lda + this_col]); - if (d_weights) { val = val * d_weights[r]; } - sum += val; - } - - if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum); - } - - template - void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - const WeightT *d_weights, - int nrows, - int ncols, - int key_offset, - int nkeys, - DataIteratorT d_sums, - cudaStream_t st) { - // x-dim refers to the column in the input data - // y-dim refers to the key - // z-dim refers to a partitioning of the rows among the threadblocks - dim3 grid, block; - block.x = 256; // Adjust me! 
- block.y = 1; // Don't adjust me! - grid.x = raft::ceildiv(ncols, (int) block.x); - grid.y = nkeys; - grid.z = std::max(40960000 / nkeys / ncols, (int) 1); // Adjust me! - grid.z = std::min(grid.z, (unsigned int) nrows); - grid.z = std::min(grid.z, MAX_BLOCKS); - - sum_rows_by_key_large_nkeys_kernel_rowmajor<<>>( - d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums); - } + // if ((end_row-start_row) / (r-start_row) != global_key) continue; + DataType val = __ldcg(&d_A[r * lda + this_col]); + if (d_weights) { val = val * d_weights[r]; } + sum += val; + } + + if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum); +} + +template +void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT* d_weights, + int nrows, + int ncols, + int key_offset, + int nkeys, + DataIteratorT d_sums, + cudaStream_t st) +{ + // x-dim refers to the column in the input data + // y-dim refers to the key + // z-dim refers to a partitioning of the rows among the threadblocks + dim3 grid, block; + block.x = 256; // Adjust me! + block.y = 1; // Don't adjust me! + grid.x = raft::ceildiv(ncols, (int)block.x); + grid.y = nkeys; + grid.z = std::max(40960000 / nkeys / ncols, (int)1); // Adjust me! 
+ grid.z = std::min(grid.z, (unsigned int)nrows); + grid.z = std::min(grid.z, MAX_BLOCKS); + + sum_rows_by_key_large_nkeys_kernel_rowmajor<<>>( + d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums); +} /** * @brief Computes the weighted reduction of matrix rows for each given key @@ -353,39 +360,40 @@ namespace raft { * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream */ - template - void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - const WeightT *d_weights, - char *d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) { - typedef typename std::iterator_traits::value_type KeyType; - typedef typename std::iterator_traits::value_type DataType; - - // Following kernel needs memset - cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream); - - if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) { - // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols time - avoiding wasting BW - // with doubles we have ~20% speed up - with floats we can hope something around 2x - // Converting d_keys to char - convert_array(d_keys_char, d_keys, nrows, stream); - sum_rows_by_key_small_nkeys( - d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream); - } else { - for (KeyType key_offset = 0; key_offset < static_cast(nkeys); - key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) { - KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys); - sum_rows_by_key_large_nkeys_rowmajor( - d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream); - } - } - } +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT* d_weights, + char* d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) +{ + typedef typename std::iterator_traits::value_type KeyType; + typedef typename 
std::iterator_traits::value_type DataType; + + // Following kernel needs memset + cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream); + + if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) { + // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols time - avoiding wasting BW + // with doubles we have ~20% speed up - with floats we can hope something around 2x + // Converting d_keys to char + convert_array(d_keys_char, d_keys, nrows, stream); + sum_rows_by_key_small_nkeys( + d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream); + } else { + for (KeyType key_offset = 0; key_offset < static_cast(nkeys); + key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) { + KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys); + sum_rows_by_key_large_nkeys_rowmajor( + d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream); + } + } +} /** * @brief Computes the reduction of matrix rows for each given key @@ -403,29 +411,30 @@ namespace raft { * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream */ - template - void reduce_rows_by_key(const DataIteratorT d_A, - int lda, - const KeysIteratorT d_keys, - char *d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT d_sums, - cudaStream_t stream) { - typedef typename std::iterator_traits::value_type DataType; - reduce_rows_by_key(d_A, - lda, - d_keys, - static_cast(nullptr), - d_keys_char, - nrows, - ncols, - nkeys, - d_sums, - stream); - } - - }; // end namespace detail - }; // end namespace linalg +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + char* d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) +{ + typedef typename std::iterator_traits::value_type DataType; + reduce_rows_by_key(d_A, + lda, + d_keys, + static_cast(nullptr), + d_keys_char, + nrows, + ncols, + nkeys, + d_sums, + stream); +} + +}; // end 
namespace detail +}; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index 700ce43735..88436eda64 100644 --- a/cpp/include/raft/linalg/detail/rsvd.cuh +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -27,8 +27,8 @@ #include namespace raft { - namespace linalg { - namespace detail { +namespace linalg { +namespace detail { /** * @brief randomized singular value decomposition (RSVD) on the column major @@ -51,302 +51,301 @@ namespace raft { * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers * @param stream cuda stream */ - template - void rsvdFixedRank(const raft::handle_t &handle, - math_t *M, - int n_rows, - int n_cols, - math_t *S_vec, - math_t *U, - math_t *V, - int k, - int p, - bool use_bbt, - bool gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); +template +void rsvdFixedRank(const raft::handle_t& handle, + math_t* M, + int n_rows, + int n_cols, + math_t* S_vec, + math_t* U, + math_t* V, + int k, + int p, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); - // All the notations are following Algorithm 4 & 5 in S. Voronin's paper: - // https://arxiv.org/abs/1502.05366 + // All the notations are following Algorithm 4 & 5 in S. Voronin's paper: + // https://arxiv.org/abs/1502.05366 - int m = n_rows, n = n_cols; - int l = k + p; // Total number of singular values to be computed before truncation - int q = 2; // Number of power sampling counts - int s = 1; // Frequency controller for QR decomposition during power sampling - // scheme. 
s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s - // > 2: less frequent QR + int m = n_rows, n = n_cols; + int l = k + p; // Total number of singular values to be computed before truncation + int q = 2; // Number of power sampling counts + int s = 1; // Frequency controller for QR decomposition during power sampling + // scheme. s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s + // > 2: less frequent QR - const math_t alpha = 1.0, beta = 0.0; + const math_t alpha = 1.0, beta = 0.0; - // Build temporary U, S, V matrices - rmm::device_uvector S_vec_tmp(l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + // Build temporary U, S, V matrices + rmm::device_uvector S_vec_tmp(l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); - // build random matrix - rmm::device_uvector RN(n * l, stream); - raft::random::Rng rng(484); - rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream); + // build random matrix + rmm::device_uvector RN(n * l, stream); + raft::random::Rng rng(484); + rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream); - // multiply to get matrix of random samples Y - rmm::device_uvector Y(m * l, stream); - raft::linalg::gemm( - handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + // multiply to get matrix of random samples Y + rmm::device_uvector Y(m * l, stream); + raft::linalg::gemm( + handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); - // now build up (M M^T)^q R - rmm::device_uvector Z(n * l, stream); - rmm::device_uvector Yorth(m * l, stream); - rmm::device_uvector Zorth(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + // now build up (M M^T)^q R + 
rmm::device_uvector Z(n * l, stream); + rmm::device_uvector Yorth(m * l, stream); + rmm::device_uvector Zorth(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); - // power sampling scheme - for (int j = 1; j < q; j++) { - if ((2 * j - 2) % s == 0) { - raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream); - raft::linalg::gemm(handle, - M, - m, - n, - Yorth.data(), - Z.data(), - n, - l, - CUBLAS_OP_T, - CUBLAS_OP_N, - alpha, - beta, - stream); - } else { - raft::linalg::gemm( - handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, - stream); - } + // power sampling scheme + for (int j = 1; j < q; j++) { + if ((2 * j - 2) % s == 0) { + raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream); + raft::linalg::gemm(handle, + M, + m, + n, + Yorth.data(), + Z.data(), + n, + l, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + stream); + } else { + raft::linalg::gemm( + handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + } - if ((2 * j - 1) % s == 0) { - raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream); - raft::linalg::gemm(handle, - M, - m, - n, - Zorth.data(), - Y.data(), - m, - l, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - } else { - raft::linalg::gemm( - handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, - stream); - } - } + if ((2 * j - 1) % s == 0) { + raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream); + raft::linalg::gemm(handle, + M, + m, + n, + Zorth.data(), + Y.data(), + m, + l, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + } else { + raft::linalg::gemm( + handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + } + } - // orthogonalize 
on exit from loop to get Q - rmm::device_uvector Q(m * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); - raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream); + // orthogonalize on exit from loop to get Q + rmm::device_uvector Q(m * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); + raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream); - // either QR of B^T method, or eigendecompose BB^T method - if (!use_bbt) { - // form Bt = Mt*Q : nxm * mxl = nxl - rmm::device_uvector Bt(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); - raft::linalg::gemm( - handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + // either QR of B^T method, or eigendecompose BB^T method + if (!use_bbt) { + // form Bt = Mt*Q : nxm * mxl = nxl + rmm::device_uvector Bt(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + raft::linalg::gemm( + handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - // compute QR factorization of Bt - // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */ - rmm::device_uvector Qhat(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); - rmm::device_uvector Rhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); - raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream); + // compute QR factorization of Bt + // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */ + rmm::device_uvector Qhat(n * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + rmm::device_uvector Rhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream); 
- // compute SVD of Rhat (lxl) - rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); - rmm::device_uvector Vhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); - if (use_jacobi) - raft::linalg::svdJacobi(handle, - Rhat.data(), - l, - l, - S_vec_tmp.data(), - Uhat.data(), - Vhat.data(), - true, - true, - tol, - max_sweeps, - stream); - else - raft::linalg::svdQR(handle, - Rhat.data(), - l, - l, - S_vec_tmp.data(), - Uhat.data(), - Vhat.data(), - true, - true, - true, - stream); - raft::matrix::sliceMatrix(S_vec_tmp.data(), - 1, - l, - S_vec, - 0, - 0, - 1, - k, - stream); // First k elements of S_vec + // compute SVD of Rhat (lxl) + rmm::device_uvector Uhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + rmm::device_uvector Vhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (use_jacobi) + raft::linalg::svdJacobi(handle, + Rhat.data(), + l, + l, + S_vec_tmp.data(), + Uhat.data(), + Vhat.data(), + true, + true, + tol, + max_sweeps, + stream); + else + raft::linalg::svdQR(handle, + Rhat.data(), + l, + l, + S_vec_tmp.data(), + Uhat.data(), + Vhat.data(), + true, + true, + true, + stream); + raft::matrix::sliceMatrix(S_vec_tmp.data(), + 1, + l, + S_vec, + 0, + 0, + 1, + k, + stream); // First k elements of S_vec - // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk - if (gen_left_vec) { - raft::linalg::gemm(handle, - Q.data(), - m, - l, - Vhat.data(), - U, - m, - k /*used to be l and needs slicing*/, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - } + // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk + if (gen_left_vec) { + raft::linalg::gemm(handle, + Q.data(), + m, + l, + Vhat.data(), + U, + m, + k /*used to be l and needs slicing*/, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + 
} - // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * lxk = nxk - if (gen_right_vec) { - raft::linalg::gemm(handle, - Qhat.data(), - n, - l, - Uhat.data(), - V, - n, - k /*used to be l and needs slicing*/, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - } - } else { - // build the matrix B B^T = Q^T M M^T Q column by column - // Bt = M^T Q ; nxm * mxk = nxk - rmm::device_uvector B(n * l, stream); - raft::linalg::gemm( - handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * lxk = nxk + if (gen_right_vec) { + raft::linalg::gemm(handle, + Qhat.data(), + n, + l, + Uhat.data(), + V, + n, + k /*used to be l and needs slicing*/, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + } + } else { + // build the matrix B B^T = Q^T M M^T Q column by column + // Bt = M^T Q ; nxm * mxk = nxk + rmm::device_uvector B(n * l, stream); + raft::linalg::gemm( + handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - rmm::device_uvector BBt(l * l, stream); - raft::linalg::gemm(handle, - B.data(), - l, - n, - B.data(), - BBt.data(), - l, - l, - CUBLAS_OP_N, - CUBLAS_OP_T, - alpha, - beta, - stream); + rmm::device_uvector BBt(l * l, stream); + raft::linalg::gemm(handle, + B.data(), + l, + n, + B.data(), + BBt.data(), + l, + l, + CUBLAS_OP_N, + CUBLAS_OP_T, + alpha, + beta, + stream); - // compute eigendecomposition of BBt - rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); - rmm::device_uvector Uhat_dup(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); - raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream); - if (use_jacobi) - raft::linalg::eigJacobi( - handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps); - else - 
raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream); - raft::matrix::seqRoot(S_vec_tmp.data(), l, stream); - raft::matrix::sliceMatrix(S_vec_tmp.data(), - 1, - l, - S_vec, - 0, - p, - 1, - l, - stream); // Last k elements of S_vec - raft::matrix::colReverse(S_vec, 1, k, stream); + // compute eigendecomposition of BBt + rmm::device_uvector Uhat(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + rmm::device_uvector Uhat_dup(l * l, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream); + if (use_jacobi) + raft::linalg::eigJacobi( + handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps); + else + raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream); + raft::matrix::seqRoot(S_vec_tmp.data(), l, stream); + raft::matrix::sliceMatrix(S_vec_tmp.data(), + 1, + l, + S_vec, + 0, + p, + 1, + l, + stream); // Last k elements of S_vec + raft::matrix::colReverse(S_vec, 1, k, stream); - // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk - if (gen_left_vec) { - raft::linalg::gemm(handle, - Q.data(), - m, - l, - Uhat.data() + p * l, - U, - m, - k, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - raft::matrix::colReverse(U, m, k, stream); - } + // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk + if (gen_left_vec) { + raft::linalg::gemm(handle, + Q.data(), + m, + l, + Uhat.data() + p * l, + U, + m, + k, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::colReverse(U, m, k, stream); + } - // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] * - // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk - if (gen_right_vec) { - rmm::device_uvector Sinv(k * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * 
k * k, stream)); - rmm::device_uvector UhatSinv(l * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); - raft::matrix::reciprocal(S_vec_tmp.data(), l, stream); - raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream); + // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] * + // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk + if (gen_right_vec) { + rmm::device_uvector Sinv(k * k, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + rmm::device_uvector UhatSinv(l * k, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); + raft::matrix::reciprocal(S_vec_tmp.data(), l, stream); + raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream); - raft::linalg::gemm(handle, - Uhat.data() + p * l, - l, - k, - Sinv.data(), - UhatSinv.data(), - l, - k, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - raft::linalg::gemm(handle, - B.data(), - l, - n, - UhatSinv.data(), - V, - n, - k, - CUBLAS_OP_T, - CUBLAS_OP_N, - alpha, - beta, - stream); - raft::matrix::colReverse(V, n, k, stream); - } - } - } + raft::linalg::gemm(handle, + Uhat.data() + p * l, + l, + k, + Sinv.data(), + UhatSinv.data(), + l, + k, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::linalg::gemm(handle, + B.data(), + l, + n, + UhatSinv.data(), + V, + n, + k, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::colReverse(V, n, k, stream); + } + } +} /** * @brief randomized singular value decomposition (RSVD) on the column major @@ -369,44 +368,45 @@ namespace raft { * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers * @param stream cuda stream */ - template - void rsvdPerc(const raft::handle_t &handle, - math_t *M, - int n_rows, - int n_cols, - math_t *S_vec, - math_t *U, - math_t *V, - math_t PC_perc, - math_t UpS_perc, - bool use_bbt, - bool 
gen_left_vec, - bool gen_right_vec, - bool use_jacobi, - math_t tol, - int max_sweeps, - cudaStream_t stream) { - int k = max((int) (min(n_rows, n_cols) * PC_perc), - 1); // Number of singular values to be computed - int p = max((int) (min(n_rows, n_cols) * UpS_perc), 1); // Upsamples - rsvdFixedRank(handle, - M, - n_rows, - n_cols, - S_vec, - U, - V, - k, - p, - use_bbt, - gen_left_vec, - gen_right_vec, - use_jacobi, - tol, - max_sweeps, - stream); - } +template +void rsvdPerc(const raft::handle_t& handle, + math_t* M, + int n_rows, + int n_cols, + math_t* S_vec, + math_t* U, + math_t* V, + math_t PC_perc, + math_t UpS_perc, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + int k = max((int)(min(n_rows, n_cols) * PC_perc), + 1); // Number of singular values to be computed + int p = max((int)(min(n_rows, n_cols) * UpS_perc), 1); // Upsamples + rsvdFixedRank(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, + k, + p, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); +} - }; // end namespace detail - }; // end namespace linalg +}; // end namespace detail +}; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh index 935ffed190..bcfcc9df01 100644 --- a/cpp/include/raft/linalg/detail/ternary_op.cuh +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -20,39 +20,41 @@ #include namespace raft { - namespace linalg { +namespace linalg { namespace detail { - template - __global__ void ternaryOpKernel( - math_t *out, const math_t *in1, const math_t *in2, const math_t *in3, IdxType len, Lambda op) { - typedef raft::TxN_t VecType; - VecType a, b, c; - IdxType idx = threadIdx.x + ((IdxType) blockIdx.x * blockDim.x); - idx *= VecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); - c.load(in3, idx); +template +__global__ 
void ternaryOpKernel( + math_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) +{ + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); - } - a.store(out, idx); - } + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); +} - template - void ternaryOpImpl(math_t *out, - const math_t *in1, - const math_t *in2, - const math_t *in3, - IdxType len, - Lambda op, - cudaStream_t stream) { - const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType) TPB); - ternaryOpKernel - <<>>(out, in1, in2, in3, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } +template +void ternaryOpImpl(math_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} /** * @brief perform element-wise ternary operation on the input arrays @@ -68,35 +70,36 @@ namespace detail { * @param op the device-lambda * @param stream cuda stream where to launch work */ - template - void ternaryOp(math_t *out, - const math_t *in1, - const math_t *in2, - const math_t *in3, - IdxType len, - Lambda op, - cudaStream_t stream) { - size_t bytes = len * sizeof(math_t); - if (16 / sizeof(math_t) && bytes % 16 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (1 / sizeof(math_t)) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else { - ternaryOpImpl(out, in1, in2, in3, len, op, stream); - } - } +template +void ternaryOp(math_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } +} }; // end namespace detail - }; // end namespace linalg +}; // end namespace 
linalg }; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index bfa302eb4f..5540cca3a5 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -36,7 +36,7 @@ void lstsqSvdQR(const raft::handle_t& handle, math_t* w, cudaStream_t stream) { - detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); + detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); } /** Solves the linear ordinary least squares problem `Aw = b` @@ -54,7 +54,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle, math_t* w, cudaStream_t stream) { - detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); + detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); } /** Solves the linear ordinary least squares problem `Aw = b` @@ -70,7 +70,7 @@ void lstsqEig(const raft::handle_t& handle, math_t* w, cudaStream_t stream) { - detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); + detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); } /** Solves the linear ordinary least squares problem `Aw = b` @@ -91,7 +91,7 @@ void lstsqQR(const raft::handle_t& handle, math_t* w, cudaStream_t stream) { - detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); + detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); } }; // namespace linalg diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index 1a39d4b3ba..07760f0c5c 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -20,7 +20,7 @@ #include namespace raft { - namespace linalg { +namespace linalg { /** * @defgroup ScalarOps Scalar operations on the input buffer @@ -33,11 +33,11 @@ namespace raft { * @param stream cuda stream where to launch work * @{ */ -template -void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len,[scalar] __device__(math_t - in) { 
return raft::myPow(in, scalar); }, stream); +template +void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream); } /** @} */ @@ -52,12 +52,13 @@ void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cuda * @param stream cuda stream where to launch work * @{ */ -template -void power(math_t *out, const math_t *in1, const math_t *in2, IdxType len, cudaStream_t stream) { - raft::linalg::binaryOp( - out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); +template +void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); } /** @} */ - }; // end namespace linalg +}; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index c6e163d491..82d272671c 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -21,7 +21,6 @@ namespace raft { namespace linalg { - /** * @brief Computes the sum-reduction of matrix columns for each given key * @tparam T the input data type (as well as the output reduced matrix) @@ -40,15 +39,16 @@ namespace linalg { * @param nkeys number of unique keys in the keys array * @param stream cuda stream to launch the kernel onto */ -template -void reduce_cols_by_key(const T *data, +template +void reduce_cols_by_key(const T* data, const KeyIteratorT keys, - T *out, + T* out, IdxType nrows, IdxType ncols, IdxType nkeys, - cudaStream_t stream) { - detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); + cudaStream_t stream) +{ + detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream); 
} }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 3b5345a540..986f5e8a7f 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -39,18 +39,20 @@ namespace linalg { * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream */ -template +template void reduce_rows_by_key(const DataIteratorT d_A, int lda, const KeysIteratorT d_keys, - const WeightT *d_weights, - char *d_keys_char, + const WeightT* d_weights, + char* d_keys_char, int nrows, int ncols, int nkeys, DataIteratorT d_sums, - cudaStream_t stream) { - detail::reduce_rows_by_key(d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); + cudaStream_t stream) +{ + detail::reduce_rows_by_key( + d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); } /** @@ -69,29 +71,31 @@ void reduce_rows_by_key(const DataIteratorT d_A, * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream */ -template +template void reduce_rows_by_key(const DataIteratorT d_A, int lda, const KeysIteratorT d_keys, - char *d_keys_char, + char* d_keys_char, int nrows, int ncols, int nkeys, DataIteratorT d_sums, - cudaStream_t stream) { - typedef typename std::iterator_traits::value_type DataType; - reduce_rows_by_key(d_A, - lda, - d_keys, - static_cast(nullptr), - d_keys_char, - nrows, - ncols, - nkeys, - d_sums, - stream); + cudaStream_t stream) +{ + typedef typename std::iterator_traits::value_type DataType; + reduce_rows_by_key(d_A, + lda, + d_keys, + static_cast(nullptr), + d_keys_char, + nrows, + ncols, + nkeys, + d_sums, + stream); } }; // end namespace detail }; // end namespace linalg -}; // end namespace raft +} +; // end namespace raft diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh index e789abce30..d1d739489f 100644 --- 
a/cpp/include/raft/linalg/rsvd.cuh +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -42,14 +42,14 @@ namespace linalg { * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers * @param stream cuda stream */ -template -void rsvdFixedRank(const raft::handle_t &handle, - math_t *M, +template +void rsvdFixedRank(const raft::handle_t& handle, + math_t* M, int n_rows, int n_cols, - math_t *S_vec, - math_t *U, - math_t *V, + math_t* S_vec, + math_t* U, + math_t* V, int k, int p, bool use_bbt, @@ -58,12 +58,26 @@ void rsvdFixedRank(const raft::handle_t &handle, bool use_jacobi, math_t tol, int max_sweeps, - cudaStream_t stream) { - - detail::rsvdFixedRank(handle, M, n_rows, n_cols, S_vec, U, V, k, p, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream); + cudaStream_t stream) +{ + detail::rsvdFixedRank(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, + k, + p, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); } - /** * @brief randomized singular value decomposition (RSVD) on the column major * float type input matrix (Jacobi-based), by specifying the PC and upsampling @@ -85,14 +99,14 @@ void rsvdFixedRank(const raft::handle_t &handle, * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers * @param stream cuda stream */ -template -void rsvdPerc(const raft::handle_t &handle, - math_t *M, +template +void rsvdPerc(const raft::handle_t& handle, + math_t* M, int n_rows, int n_cols, - math_t *S_vec, - math_t *U, - math_t *V, + math_t* S_vec, + math_t* U, + math_t* V, math_t PC_perc, math_t UpS_perc, bool use_bbt, @@ -101,8 +115,24 @@ void rsvdPerc(const raft::handle_t &handle, bool use_jacobi, math_t tol, int max_sweeps, - cudaStream_t stream) { - detail::rsvdPerc(handle, M, n_rows, n_cols, S_vec, U, V, PC_perc, UpS_perc, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream); + cudaStream_t stream) +{ + detail::rsvdPerc(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, 
+ PC_perc, + UpS_perc, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index 49eb6788ef..c431cfdcc0 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -35,8 +35,8 @@ namespace linalg { template void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); + raft::linalg::unaryOp( + out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 99e21fd5a0..be411e6492 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -34,15 +34,16 @@ namespace linalg { * @param op the device-lambda * @param stream cuda stream where to launch work */ -template -void ternaryOp(math_t *out, - const math_t *in1, - const math_t *in2, - const math_t *in3, +template +void ternaryOp(math_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, IdxType len, Lambda op, - cudaStream_t stream) { - detail::ternaryOp(out, in1, in2, in3, len, op, stream); + cudaStream_t stream) +{ + detail::ternaryOp(out, in1, in2, in3, len, op, stream); } }; // end namespace linalg diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu index 8f336d583f..13d9791992 100644 --- a/cpp/test/linalg/power.cu +++ b/cpp/test/linalg/power.cu @@ -16,115 +16,115 @@ #include "test_utils.h" #include -#include #include +#include #include namespace linalg { - template - __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); } - } - - template - void naivePowerElem(Type* out, 
const Type* in1, const Type* in2, int len, cudaStream_t stream) - { - static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naivePowerElemKernel<<>>(out, in1, in2, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - template - __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); } - } - - template - void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) - { - static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naivePowerScalarKernel<<>>(out, in1, in2, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - template - struct PowerInputs { - T tolerance; - int len; - unsigned long long int seed; - }; - - template - ::std::ostream& operator<<(::std::ostream& os, const PowerInputs& dims) - { - return os; - } - - template - class PowerTest : public ::testing::TestWithParam> { - protected: - PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {} - - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - raft::random::Rng r(params.seed); - int len = params.len; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - - in1.resize(len, stream); - in2.resize(len, stream); - out_ref.resize(len, stream); - out.resize(len, stream); - r.uniform(in1.data(), len, T(1.0), T(2.0), stream); - r.uniform(in2.data(), len, T(1.0), T(2.0), stream); - - naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream); - naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream); - - power(out.data(), in1.data(), in2.data(), len, stream); - powerScalar(out.data(), out.data(), T(2), len, stream); - power(in1.data(), in1.data(), in2.data(), len, stream); - powerScalar(in1.data(), in1.data(), T(2), len, stream); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); - } - - protected: - cudaStream_t stream = 0; - PowerInputs params; 
- rmm::device_uvector in1, in2, out_ref, out; - int device_count = 0; - }; - - const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; - - const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; - - typedef PowerTest PowerTestF; - TEST_P(PowerTestF, Result) - { - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); - } - - typedef PowerTest PowerTestD; - TEST_P(PowerTestD, Result) +template +__global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); } +} + +template +void naivePowerElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naivePowerElemKernel<<>>(out, in1, in2, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +__global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); } +} + +template +void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) +{ + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naivePowerScalarKernel<<>>(out, in1, in2, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +struct PowerInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const PowerInputs& dims) +{ + return os; +} + +template +class PowerTest : public ::testing::TestWithParam> { + protected: + PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {} + + void SetUp() override + { + params = 
::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + in1.resize(len, stream); + in2.resize(len, stream); + out_ref.resize(len, stream); + out.resize(len, stream); + r.uniform(in1.data(), len, T(1.0), T(2.0), stream); + r.uniform(in2.data(), len, T(1.0), T(2.0), stream); + + naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream); + naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream); + + power(out.data(), in1.data(), in2.data(), len, stream); + powerScalar(out.data(), out.data(), T(2), len, stream); + power(in1.data(), in1.data(), in2.data(), len, stream); + powerScalar(in1.data(), in1.data(), T(2), len, stream); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + protected: + cudaStream_t stream = 0; + PowerInputs params; + rmm::device_uvector in1, in2, out_ref, out; + int device_count = 0; +}; + +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; + +typedef PowerTest PowerTestF; +TEST_P(PowerTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); +} + +typedef PowerTest PowerTestD; +TEST_P(PowerTestD, Result) { - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestF, ::testing::ValuesIn(inputsf2)); diff --git 
a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu index 55057b4894..072f586bac 100644 --- a/cpp/test/linalg/reduce_cols_by_key.cu +++ b/cpp/test/linalg/reduce_cols_by_key.cu @@ -16,107 +16,107 @@ #include "test_utils.h" #include -#include #include #include +#include #include namespace raft { - namespace linalg { +namespace linalg { - template - void naiveReduceColsByKey(const T* in, - const uint32_t* keys, - T* out_ref, - uint32_t nrows, - uint32_t ncols, - uint32_t nkeys, - cudaStream_t stream) - { - std::vector h_keys(ncols, 0u); - raft::copy(&(h_keys[0]), keys, ncols, stream); - std::vector h_in(nrows * ncols); - raft::copy(&(h_in[0]), in, nrows * ncols, stream); - raft::interruptible::synchronize(stream); - std::vector out(nrows * nkeys, T(0)); - for (uint32_t i = 0; i < nrows; ++i) { - for (uint32_t j = 0; j < ncols; ++j) { - out[i * nkeys + h_keys[j]] += h_in[i * ncols + j]; - } - } - raft::copy(out_ref, &(out[0]), nrows * nkeys, stream); - raft::interruptible::synchronize(stream); - } +template +void naiveReduceColsByKey(const T* in, + const uint32_t* keys, + T* out_ref, + uint32_t nrows, + uint32_t ncols, + uint32_t nkeys, + cudaStream_t stream) +{ + std::vector h_keys(ncols, 0u); + raft::copy(&(h_keys[0]), keys, ncols, stream); + std::vector h_in(nrows * ncols); + raft::copy(&(h_in[0]), in, nrows * ncols, stream); + raft::interruptible::synchronize(stream); + std::vector out(nrows * nkeys, T(0)); + for (uint32_t i = 0; i < nrows; ++i) { + for (uint32_t j = 0; j < ncols; ++j) { + out[i * nkeys + h_keys[j]] += h_in[i * ncols + j]; + } + } + raft::copy(out_ref, &(out[0]), nrows * nkeys, stream); + raft::interruptible::synchronize(stream); +} - template - struct ReduceColsInputs { - T tolerance; - uint32_t rows; - uint32_t cols; - uint32_t nkeys; - unsigned long long int seed; - }; +template +struct ReduceColsInputs { + T tolerance; + uint32_t rows; + uint32_t cols; + uint32_t nkeys; + unsigned long long int seed; +}; - template - 
::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs& dims) - { - return os; - } +template +::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs& dims) +{ + return os; +} - template - class ReduceColsTest : public ::testing::TestWithParam> { - protected: - ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {} +template +class ReduceColsTest : public ::testing::TestWithParam> { + protected: + ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {} - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - raft::random::Rng r(params.seed); - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - auto nrows = params.rows; - auto ncols = params.cols; - auto nkeys = params.nkeys; - in.resize(nrows * ncols, stream); - keys.resize(ncols, stream); - out_ref.resize(nrows * nkeys, stream); - out.resize(nrows * nkeys, stream); - r.uniform(in.data(), nrows * ncols, T(-1.0), T(1.0), stream); - r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream); - naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream); - reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream); - raft::interruptible::synchronize(stream); - } + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + auto nrows = params.rows; + auto ncols = params.cols; + auto nkeys = params.nkeys; + in.resize(nrows * ncols, stream); + keys.resize(ncols, stream); + out_ref.resize(nrows * nkeys, stream); + out.resize(nrows * nkeys, stream); + r.uniform(in.data(), nrows * ncols, T(-1.0), T(1.0), stream); + r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream); + naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream); + reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream); + 
raft::interruptible::synchronize(stream); + } - void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } - protected: - cudaStream_t stream = 0; - ReduceColsInputs params; - rmm::device_uvector in, out_ref, out; - rmm::device_uvector keys; - }; + protected: + cudaStream_t stream = 0; + ReduceColsInputs params; + rmm::device_uvector in, out_ref, out; + rmm::device_uvector keys; +}; - const std::vector> inputsf = {{0.0001f, 128, 32, 6, 1234ULL}, - {0.0005f, 121, 63, 10, 1234ULL}}; - typedef ReduceColsTest ReduceColsTestF; - TEST_P(ReduceColsTestF, Result) - { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.rows * params.nkeys, - raft::CompareApprox(params.tolerance))); - } - INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf)); +const std::vector> inputsf = {{0.0001f, 128, 32, 6, 1234ULL}, + {0.0005f, 121, 63, 10, 1234ULL}}; +typedef ReduceColsTest ReduceColsTestF; +TEST_P(ReduceColsTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.rows * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf)); - const std::vector> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL}, - {0.0000001, 121, 63, 10, 1234ULL}}; - typedef ReduceColsTest ReduceColsTestD; - TEST_P(ReduceColsTestD, Result) +const std::vector> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL}, + {0.0000001, 121, 63, 10, 1234ULL}}; +typedef ReduceColsTest ReduceColsTestD; +TEST_P(ReduceColsTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.rows * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.rows * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestD, 
::testing::ValuesIn(inputsd2)); diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu index e6dc8cef7f..1bda427e6f 100644 --- a/cpp/test/linalg/reduce_rows_by_key.cu +++ b/cpp/test/linalg/reduce_rows_by_key.cu @@ -17,246 +17,246 @@ #include "test_utils.h" #include #include -#include #include +#include #include namespace raft { - namespace linalg { +namespace linalg { - template - __global__ void naiveReduceRowsByKeyKernel(const Type* d_A, - int lda, - uint32_t* d_keys, - const Type* d_weight, - char* d_char_keys, - int nrows, - int ncols, - int nkeys, - Type* d_sums) - { - int c = threadIdx.x + blockIdx.x * blockDim.x; - if (c >= ncols) return; - int this_key = threadIdx.y + blockIdx.y * blockDim.y; +template +__global__ void naiveReduceRowsByKeyKernel(const Type* d_A, + int lda, + uint32_t* d_keys, + const Type* d_weight, + char* d_char_keys, + int nrows, + int ncols, + int nkeys, + Type* d_sums) +{ + int c = threadIdx.x + blockIdx.x * blockDim.x; + if (c >= ncols) return; + int this_key = threadIdx.y + blockIdx.y * blockDim.y; - Type sum = 0.0; - for (int r = 0; r < nrows; r++) { - if (this_key != d_keys[r]) continue; - Type wt = 1; - if (d_weight) wt = d_weight[r]; - sum += d_A[lda * r + c] * wt; - } - d_sums[this_key * ncols + c] = sum; - } - template - void naiveReduceRowsByKey(const Type* d_A, - int lda, - uint32_t* d_keys, - const Type* d_weight, - char* d_char_keys, - int nrows, - int ncols, - int nkeys, - Type* d_sums, - cudaStream_t stream) - { - cudaMemset(d_sums, 0, sizeof(Type) * nkeys * ncols); + Type sum = 0.0; + for (int r = 0; r < nrows; r++) { + if (this_key != d_keys[r]) continue; + Type wt = 1; + if (d_weight) wt = d_weight[r]; + sum += d_A[lda * r + c] * wt; + } + d_sums[this_key * ncols + c] = sum; +} +template +void naiveReduceRowsByKey(const Type* d_A, + int lda, + uint32_t* d_keys, + const Type* d_weight, + char* d_char_keys, + int nrows, + int ncols, + int nkeys, + Type* d_sums, + cudaStream_t stream) 
+{ + cudaMemset(d_sums, 0, sizeof(Type) * nkeys * ncols); - naiveReduceRowsByKeyKernel<<>>( - d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums); - } + naiveReduceRowsByKeyKernel<<>>( + d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums); +} - template - struct ReduceRowsInputs { - T tolerance; - int nobs; - uint32_t cols; - uint32_t nkeys; - unsigned long long int seed; - bool weighted; - T max_weight; - }; +template +struct ReduceRowsInputs { + T tolerance; + int nobs; + uint32_t cols; + uint32_t nkeys; + unsigned long long int seed; + bool weighted; + T max_weight; +}; - template - ::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs& dims) - { - return os; - } +template +::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs& dims) +{ + return os; +} - template - class ReduceRowTest : public ::testing::TestWithParam> { - public: - ReduceRowTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - in(params.nobs * params.cols, stream), - out(params.nkeys * params.cols, stream), - out_ref(params.nkeys * params.cols, stream), - keys(params.nobs, stream), - scratch_buf(params.nobs, stream) - { - } +template +class ReduceRowTest : public ::testing::TestWithParam> { + public: + ReduceRowTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + in(params.nobs * params.cols, stream), + out(params.nkeys * params.cols, stream), + out_ref(params.nkeys * params.cols, stream), + keys(params.nobs, stream), + scratch_buf(params.nobs, stream) + { + } - protected: - void SetUp() override - { - raft::random::Rng r(params.seed); - raft::random::Rng r_int(params.seed); + protected: + void SetUp() override + { + raft::random::Rng r(params.seed); + raft::random::Rng r_int(params.seed); - int nobs = params.nobs; - uint32_t cols = params.cols; - uint32_t nkeys = params.nkeys; - r.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream); - 
r_int.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream); + int nobs = params.nobs; + uint32_t cols = params.cols; + uint32_t nkeys = params.nkeys; + r.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream); + r_int.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream); - rmm::device_uvector weight(0, stream); - if (params.weighted) { - weight.resize(nobs, stream); - raft::random::Rng r(params.seed, raft::random::GeneratorType::GenPhilox); - r.uniform(weight.data(), nobs, T(1), params.max_weight, stream); - } + rmm::device_uvector weight(0, stream); + if (params.weighted) { + weight.resize(nobs, stream); + raft::random::Rng r(params.seed, raft::random::GeneratorType::GenPhilox); + r.uniform(weight.data(), nobs, T(1), params.max_weight, stream); + } - naiveReduceRowsByKey(in.data(), - cols, - keys.data(), - params.weighted ? weight.data() : nullptr, - scratch_buf.data(), - nobs, - cols, - nkeys, - out_ref.data(), - stream); - if (params.weighted) { - reduce_rows_by_key(in.data(), - cols, - keys.data(), - params.weighted ? weight.data() : nullptr, - scratch_buf.data(), - nobs, - cols, - nkeys, - out.data(), - stream); - } else { - reduce_rows_by_key( - in.data(), cols, keys.data(), scratch_buf.data(), nobs, cols, nkeys, out.data(), stream); - } - handle.sync_stream(stream); - } + naiveReduceRowsByKey(in.data(), + cols, + keys.data(), + params.weighted ? weight.data() : nullptr, + scratch_buf.data(), + nobs, + cols, + nkeys, + out_ref.data(), + stream); + if (params.weighted) { + reduce_rows_by_key(in.data(), + cols, + keys.data(), + params.weighted ? 
weight.data() : nullptr, + scratch_buf.data(), + nobs, + cols, + nkeys, + out.data(), + stream); + } else { + reduce_rows_by_key( + in.data(), cols, keys.data(), scratch_buf.data(), nobs, cols, nkeys, out.data(), stream); + } + handle.sync_stream(stream); + } - protected: - ReduceRowsInputs params; - raft::handle_t handle; - cudaStream_t stream = 0; + protected: + ReduceRowsInputs params; + raft::handle_t handle; + cudaStream_t stream = 0; - int device_count = 0; - rmm::device_uvector in, out, out_ref; - rmm::device_uvector keys; - rmm::device_uvector scratch_buf; - }; + int device_count = 0; + rmm::device_uvector in, out, out_ref; + rmm::device_uvector keys; + rmm::device_uvector scratch_buf; +}; // ReduceRowTestF // 128 Obs, 32 cols, 6 clusters - const std::vector> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false}, - {0.000001f, 128, 32, 6, 1234ULL, true, 1.0}, - {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}}; - typedef ReduceRowTest ReduceRowTestF; - TEST_P(ReduceRowTestF, Result) - { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); - } - INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2)); +const std::vector> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false}, + {0.000001f, 128, 32, 6, 1234ULL, true, 1.0}, + {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}}; +typedef ReduceRowTest ReduceRowTestF; +TEST_P(ReduceRowTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2)); // ReduceRowTestD // 128 Obs, 32 cols, 6 clusters, double precision - const std::vector> inputsd2 = { - {0.00000001, 128, 32, 6, 1234ULL, false}, - {0.00000001, 128, 32, 6, 1234ULL, true, 2.0}, - {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}}; - typedef ReduceRowTest ReduceRowTestD; - 
TEST_P(ReduceRowTestD, Result) +const std::vector> inputsd2 = { + {0.00000001, 128, 32, 6, 1234ULL, false}, + {0.00000001, 128, 32, 6, 1234ULL, true, 2.0}, + {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}}; +typedef ReduceRowTest ReduceRowTestD; +TEST_P(ReduceRowTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestD, ::testing::ValuesIn(inputsd2)); // ReduceRowTestSmallnKey // 128 Obs, 32 cols, 3 clusters const std::vector> inputsf_small_nkey = { - {0.000001f, 128, 32, 3, 1234ULL, false}, - {0.000001f, 128, 32, 3, 1234ULL, true, 5.0}, - {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}}; + {0.000001f, 128, 32, 3, 1234ULL, false}, + {0.000001f, 128, 32, 3, 1234ULL, true, 5.0}, + {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}}; typedef ReduceRowTest ReduceRowTestSmallnKey; TEST_P(ReduceRowTestSmallnKey, Result) { -ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceRowTests, - ReduceRowTestSmallnKey, - ::testing::ValuesIn(inputsf_small_nkey)); + ReduceRowTestSmallnKey, + ::testing::ValuesIn(inputsf_small_nkey)); // ReduceRowTestBigSpace // 512 Obs, 1024 cols, 32 clusters, double precision const std::vector> inputsd_big_space = { - {0.00000001, 512, 1024, 40, 1234ULL, false}, - {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0}, - {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}}; + {0.00000001, 512, 1024, 40, 1234ULL, false}, + {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0}, + {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}}; typedef ReduceRowTest 
ReduceRowTestBigSpace; TEST_P(ReduceRowTestBigSpace, Result) { -ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceRowTests, - ReduceRowTestBigSpace, - ::testing::ValuesIn(inputsd_big_space)); + ReduceRowTestBigSpace, + ::testing::ValuesIn(inputsd_big_space)); // ReduceRowTestManyObs // 100000 Obs, 37 cols, 32 clusters const std::vector> inputsf_many_obs = { - {0.00001f, 100000, 37, 32, 1234ULL, false}, - {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0}, - {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}}; + {0.00001f, 100000, 37, 32, 1234ULL, false}, + {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0}, + {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}}; typedef ReduceRowTest ReduceRowTestManyObs; TEST_P(ReduceRowTestManyObs, Result) { -ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceRowTests, - ReduceRowTestManyObs, - ::testing::ValuesIn(inputsf_many_obs)); + ReduceRowTestManyObs, + ::testing::ValuesIn(inputsf_many_obs)); // ReduceRowTestManyClusters // 100000 Obs, 37 cols, 2048 clusters const std::vector> inputsf_many_cluster = { - {0.00001f, 100000, 37, 2048, 1234ULL, false}, - {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0}, - {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}}; + {0.00001f, 100000, 37, 2048, 1234ULL, false}, + {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0}, + {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}}; typedef ReduceRowTest ReduceRowTestManyClusters; TEST_P(ReduceRowTestManyClusters, Result) { -ASSERT_TRUE(raft::devArrMatch(out_ref.data(), - 
out.data(), - params.cols * params.nkeys, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref.data(), + out.data(), + params.cols * params.nkeys, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ReduceRowTests, - ReduceRowTestManyClusters, - ::testing::ValuesIn(inputsf_many_cluster)); + ReduceRowTestManyClusters, + ::testing::ValuesIn(inputsf_many_cluster)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu index 260ea07268..7a315ddde6 100644 --- a/cpp/test/linalg/rsvd.cu +++ b/cpp/test/linalg/rsvd.cu @@ -16,283 +16,283 @@ #include "test_utils.h" #include -#include #include #include #include +#include #include #include namespace raft { - namespace linalg { - - template - struct RsvdInputs { - T tolerance; - int n_row; - int n_col; - T PC_perc; - T UpS_perc; - int k; - int p; - bool use_bbt; - unsigned long long int seed; - }; - - template - ::std::ostream& operator<<(::std::ostream& os, const RsvdInputs& dims) - { - return os; - } - - template - class RsvdTest : public ::testing::TestWithParam> { - protected: - RsvdTest() - : A(0, stream), - U(0, stream), - S(0, stream), - V(0, stream), - left_eig_vectors_ref(0, stream), - right_eig_vectors_ref(0, stream), - sing_vals_ref(0, stream) - { - } - - void SetUp() override - { - raft::handle_t handle; - stream = handle.get_stream(); - - params = ::testing::TestWithParam>::GetParam(); - // rSVD seems to be very sensitive to the random number sequence as well! 
- raft::random::Rng r(params.seed, raft::random::GenTaps); - int m = params.n_row, n = params.n_col; - T eig_svd_tol = 1.e-7; - int max_sweeps = 100; - - T mu = 0.0, sigma = 1.0; - A.resize(m * n, stream); - if (params.tolerance > 1) { // Sanity check - ASSERT(m == 3, "This test only supports mxn=3x2!"); - ASSERT(m * n == 6, "This test only supports mxn=3x2!"); - T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; - raft::update_device(A.data(), data_h, m * n, stream); - - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695}; - T right_eig_vectors_ref_h[] = {-0.638636, -0.769509}; - T sing_vals_ref_h[] = {7.065283}; - - left_eig_vectors_ref.resize(m, stream); - right_eig_vectors_ref.resize(n, stream); - sing_vals_ref.resize(1, stream); - - raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream); - raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream); - raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream); - - } else { // Other normal tests - r.normal(A.data(), m * n, mu, sigma, stream); - } - std::vector A_backup_cpu(m * - n); // Backup A matrix as svdJacobi will destroy the content of A - raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream); - - if (params.k == 0) { - params.k = max((int)(min(m, n) * params.PC_perc), 1); - params.p = max((int)(min(m, n) * params.UpS_perc), 1); - } - - U.resize(m * params.k, stream); - S.resize(params.k, stream); - V.resize(n * params.k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream)); - - // RSVD tests - if (params.k == 0) { // Test with PC and upsampling ratio - rsvdPerc(handle, - A.data(), - m, - n, - S.data(), - U.data(), - V.data(), - params.PC_perc, - params.UpS_perc, - params.use_bbt, - true, - true, - false, - eig_svd_tol, - max_sweeps, - 
stream); - } else { // Test with directly given fixed rank - rsvdFixedRank(handle, - A.data(), - m, - n, - S.data(), - U.data(), - V.data(), - params.k, - params.p, - params.use_bbt, - true, - true, - true, - eig_svd_tol, - max_sweeps, - stream); - } - raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream); - } - - protected: - cudaStream_t stream = 0; - RsvdInputs params; - rmm::device_uvector A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; - }; - - const std::vector> inputs_fx = { - // Test with ratios - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT - }; - - const std::vector> inputs_dx = { - // Test with ratios - {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 256, 256, 0.2, 0.05, 0, 
0, false, 4321ULL}, // Square + non-BBT - {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT - }; - - const std::vector> sanity_inputs_fx = { - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; - - const std::vector> sanity_inputs_dx = { - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}}; - - typedef RsvdTest RsvdSanityCheckValF; - TEST_P(RsvdSanityCheckValF, Result) - { - ASSERT_TRUE(devArrMatch( - sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs(params.tolerance))); +namespace linalg { + +template +struct RsvdInputs { + T tolerance; + int n_row; + int n_col; + T PC_perc; + T UpS_perc; + int k; + int p; + bool use_bbt; + unsigned long long 
int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const RsvdInputs& dims) +{ + return os; +} + +template +class RsvdTest : public ::testing::TestWithParam> { + protected: + RsvdTest() + : A(0, stream), + U(0, stream), + S(0, stream), + V(0, stream), + left_eig_vectors_ref(0, stream), + right_eig_vectors_ref(0, stream), + sing_vals_ref(0, stream) + { + } + + void SetUp() override + { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + // rSVD seems to be very sensitive to the random number sequence as well! + raft::random::Rng r(params.seed, raft::random::GenTaps); + int m = params.n_row, n = params.n_col; + T eig_svd_tol = 1.e-7; + int max_sweeps = 100; + + T mu = 0.0, sigma = 1.0; + A.resize(m * n, stream); + if (params.tolerance > 1) { // Sanity check + ASSERT(m == 3, "This test only supports mxn=3x2!"); + ASSERT(m * n == 6, "This test only supports mxn=3x2!"); + T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; + raft::update_device(A.data(), data_h, m * n, stream); + + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695}; + T right_eig_vectors_ref_h[] = {-0.638636, -0.769509}; + T sing_vals_ref_h[] = {7.065283}; + + left_eig_vectors_ref.resize(m, stream); + right_eig_vectors_ref.resize(n, stream); + sing_vals_ref.resize(1, stream); + + raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream); + raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream); + raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream); + + } else { // Other normal tests + r.normal(A.data(), m * n, mu, sigma, stream); } + std::vector A_backup_cpu(m * + n); // Backup A matrix as svdJacobi will destroy the content of A + raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream); + + if (params.k == 0) { + params.k = max((int)(min(m, n) * params.PC_perc), 1); + params.p = max((int)(min(m, n) * params.UpS_perc), 1); + } + + 
U.resize(m * params.k, stream); + S.resize(params.k, stream); + V.resize(n * params.k, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream)); + + // RSVD tests + if (params.k == 0) { // Test with PC and upsampling ratio + rsvdPerc(handle, + A.data(), + m, + n, + S.data(), + U.data(), + V.data(), + params.PC_perc, + params.UpS_perc, + params.use_bbt, + true, + true, + false, + eig_svd_tol, + max_sweeps, + stream); + } else { // Test with directly given fixed rank + rsvdFixedRank(handle, + A.data(), + m, + n, + S.data(), + U.data(), + V.data(), + params.k, + params.p, + params.use_bbt, + true, + true, + true, + eig_svd_tol, + max_sweeps, + stream); + } + raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream); + } + + protected: + cudaStream_t stream = 0; + RsvdInputs params; + rmm::device_uvector A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; +}; + +const std::vector> inputs_fx = { + // Test with ratios + {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12f, 2048, 256, 
0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT +}; + +const std::vector> inputs_dx = { + // Test with ratios + {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT +}; + +const std::vector> sanity_inputs_fx = { + {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; + +const std::vector> sanity_inputs_dx = { + {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, 
true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL}, + {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}}; + +typedef RsvdTest RsvdSanityCheckValF; +TEST_P(RsvdSanityCheckValF, Result) +{ + ASSERT_TRUE(devArrMatch( + sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs(params.tolerance))); +} - typedef RsvdTest RsvdSanityCheckValD; - TEST_P(RsvdSanityCheckValD, Result) +typedef RsvdTest RsvdSanityCheckValD; +TEST_P(RsvdSanityCheckValD, Result) { - ASSERT_TRUE(devArrMatch( - sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs(params.tolerance))); + ASSERT_TRUE(devArrMatch( + sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs(params.tolerance))); } typedef RsvdTest RsvdSanityCheckLeftVecF; TEST_P(RsvdSanityCheckLeftVecF, Result) { -ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), - U.data(), - params.n_row * params.k, - raft::CompareApproxAbs(params.tolerance))); + ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), + U.data(), + params.n_row * params.k, + raft::CompareApproxAbs(params.tolerance))); } typedef RsvdTest RsvdSanityCheckLeftVecD; TEST_P(RsvdSanityCheckLeftVecD, Result) { -ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), - U.data(), - params.n_row * params.k, - raft::CompareApproxAbs(params.tolerance))); + ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(), + U.data(), + params.n_row * params.k, + raft::CompareApproxAbs(params.tolerance))); } typedef RsvdTest RsvdSanityCheckRightVecF; TEST_P(RsvdSanityCheckRightVecF, Result) { -ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), - V.data(), - params.n_col * params.k, - raft::CompareApproxAbs(params.tolerance))); + ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), + V.data(), + params.n_col * params.k, + raft::CompareApproxAbs(params.tolerance))); } typedef RsvdTest RsvdSanityCheckRightVecD; TEST_P(RsvdSanityCheckRightVecD, Result) { 
-ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), - V.data(), - params.n_col * params.k, - raft::CompareApproxAbs(params.tolerance))); + ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(), + V.data(), + params.n_col * params.k, + raft::CompareApproxAbs(params.tolerance))); } typedef RsvdTest RsvdTestSquareMatrixNormF; TEST_P(RsvdTestSquareMatrixNormF, Result) { -raft::handle_t handle; - -ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, - A.data(), - U.data(), - S.data(), - V.data(), - params.n_row, - params.n_col, - params.k, - 4 * params.tolerance, - handle.get_stream())); + raft::handle_t handle; + + ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, + A.data(), + U.data(), + S.data(), + V.data(), + params.n_row, + params.n_col, + params.k, + 4 * params.tolerance, + handle.get_stream())); } typedef RsvdTest RsvdTestSquareMatrixNormD; TEST_P(RsvdTestSquareMatrixNormD, Result) { -raft::handle_t handle; - -ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, - A.data(), - U.data(), - S.data(), - V.data(), - params.n_row, - params.n_col, - params.k, - 4 * params.tolerance, - handle.get_stream())); + raft::handle_t handle; + + ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle, + A.data(), + U.data(), + S.data(), + V.data(), + params.n_row, + params.n_col, + params.k, + 4 * params.tolerance, + handle.get_stream())); } INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValF, ::testing::ValuesIn(sanity_inputs_fx)); diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu index bf64d264ad..f604a8a1ef 100644 --- a/cpp/test/linalg/sqrt.cu +++ b/cpp/test/linalg/sqrt.cu @@ -21,94 +21,94 @@ #include namespace raft { - namespace linalg { - - template - __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); } - } - - template - void naiveSqrtElem(Type* out, const Type* in1, int len) - { - static const int TPB = 64; - int nblks = 
raft::ceildiv(len, TPB); - naiveSqrtElemKernel<<>>(out, in1, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - template - struct SqrtInputs { - T tolerance; - int len; - unsigned long long int seed; - }; - - template - ::std::ostream& operator<<(::std::ostream& os, const SqrtInputs& dims) - { - return os; - } - - template - class SqrtTest : public ::testing::TestWithParam> { - protected: - SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {} - - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - raft::random::Rng r(params.seed); - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - int len = params.len; - in1.resize(len, stream); - out_ref.resize(len, stream); - out.resize(len, stream); - r.uniform(in1.data(), len, T(1.0), T(2.0), stream); - - naiveSqrtElem(out_ref.data(), in1.data(), len); - - sqrt(out.data(), in1.data(), len, stream); - sqrt(in1.data(), in1.data(), len, stream); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); - } - - protected: - cudaStream_t stream = 0; - SqrtInputs params; - rmm::device_uvector in1, out_ref, out; - int device_count = 0; - }; - - const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; - - const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; - - typedef SqrtTest SqrtTestF; - TEST_P(SqrtTestF, Result) - { - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); - } - - typedef SqrtTest SqrtTestD; - TEST_P(SqrtTestD, Result) +namespace linalg { + +template +__global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); } +} + +template +void naiveSqrtElem(Type* out, const Type* in1, int len) +{ + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + 
naiveSqrtElemKernel<<>>(out, in1, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +struct SqrtInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SqrtInputs& dims) +{ + return os; +} + +template +class SqrtTest : public ::testing::TestWithParam> { + protected: + SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + int len = params.len; + in1.resize(len, stream); + out_ref.resize(len, stream); + out.resize(len, stream); + r.uniform(in1.data(), len, T(1.0), T(2.0), stream); + + naiveSqrtElem(out_ref.data(), in1.data(), len); + + sqrt(out.data(), in1.data(), len, stream); + sqrt(in1.data(), in1.data(), len, stream); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + protected: + cudaStream_t stream = 0; + SqrtInputs params; + rmm::device_uvector in1, out_ref, out; + int device_count = 0; +}; + +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; + +typedef SqrtTest SqrtTestF; +TEST_P(SqrtTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); +} + +typedef SqrtTest SqrtTestD; +TEST_P(SqrtTestD, Result) { - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch( - out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, 
raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestD, ::testing::ValuesIn(inputsd2)); -} // end namespace LinAlg -} // end namespace MLCommon +} // namespace linalg +} // namespace raft diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu index 83ec3e6029..168b0cd31d 100644 --- a/cpp/test/linalg/ternary_op.cu +++ b/cpp/test/linalg/ternary_op.cu @@ -16,90 +16,90 @@ #include "test_utils.h" #include -#include #include +#include #include namespace raft { - namespace linalg { +namespace linalg { - template - struct BinaryOpInputs { - InType tolerance; - IdxType len; - unsigned long long int seed; - }; +template +struct BinaryOpInputs { + InType tolerance; + IdxType len; + unsigned long long int seed; +}; - template - ::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) - { - return os; - } +template +::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) +{ + return os; +} - template - class ternaryOpTest : public ::testing::TestWithParam> { - public: - ternaryOpTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - out_add_ref(params.len, stream), - out_add(params.len, stream), - out_mul_ref(params.len, stream), - out_mul(params.len, stream) - { - } +template +class ternaryOpTest : public ::testing::TestWithParam> { + public: + ternaryOpTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + out_add_ref(params.len, stream), + out_add(params.len, stream), + out_mul_ref(params.len, stream), + out_mul(params.len, stream) + { + } - void SetUp() override - { - raft::random::Rng rng(params.seed); - int len = params.len; - rmm::device_uvector in1(len, stream); - rmm::device_uvector in2(len, stream); - rmm::device_uvector in3(len, stream); + void SetUp() override + { + raft::random::Rng rng(params.seed); + int len = params.len; + 
rmm::device_uvector in1(len, stream); + rmm::device_uvector in2(len, stream); + rmm::device_uvector in3(len, stream); - rng.fill(out_add_ref.data(), len, T(6.0), stream); - rng.fill(out_mul_ref.data(), len, T(6.0), stream); - rng.fill(in1.data(), len, T(1.0), stream); - rng.fill(in2.data(), len, T(2.0), stream); - rng.fill(in3.data(), len, T(3.0), stream); + rng.fill(out_add_ref.data(), len, T(6.0), stream); + rng.fill(out_mul_ref.data(), len, T(6.0), stream); + rng.fill(in1.data(), len, T(1.0), stream); + rng.fill(in2.data(), len, T(2.0), stream); + rng.fill(in3.data(), len, T(3.0), stream); - auto add = [] __device__(T a, T b, T c) { return a + b + c; }; - auto mul = [] __device__(T a, T b, T c) { return a * b * c; }; - ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream); - ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream); - } + auto add = [] __device__(T a, T b, T c) { return a + b + c; }; + auto mul = [] __device__(T a, T b, T c) { return a * b * c; }; + ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream); + ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream); + } - protected: - BinaryOpInputs params; - raft::handle_t handle; - cudaStream_t stream = 0; + protected: + BinaryOpInputs params; + raft::handle_t handle; + cudaStream_t stream = 0; - rmm::device_uvector out_add_ref, out_add, out_mul_ref, out_mul; - }; + rmm::device_uvector out_add_ref, out_add, out_mul_ref, out_mul; +}; - const std::vector> inputsf = {{0.000001f, 1024 * 1024, 1234ULL}, - {0.000001f, 1024 * 1024 + 2, 1234ULL}, - {0.000001f, 1024 * 1024 + 1, 1234ULL}}; - typedef ternaryOpTest ternaryOpTestF; - TEST_P(ternaryOpTestF, Result) - { - ASSERT_TRUE(devArrMatch( - out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch( - out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox(params.tolerance))); - } - 
INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf)); +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 1234ULL}, + {0.000001f, 1024 * 1024 + 2, 1234ULL}, + {0.000001f, 1024 * 1024 + 1, 1234ULL}}; +typedef ternaryOpTest ternaryOpTestF; +TEST_P(ternaryOpTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf)); - const std::vector> inputsd = {{0.00000001, 1024 * 1024, 1234ULL}, - {0.00000001, 1024 * 1024 + 2, 1234ULL}, - {0.00000001, 1024 * 1024 + 1, 1234ULL}}; - typedef ternaryOpTest ternaryOpTestD; - TEST_P(ternaryOpTestD, Result) +const std::vector> inputsd = {{0.00000001, 1024 * 1024, 1234ULL}, + {0.00000001, 1024 * 1024 + 2, 1234ULL}, + {0.00000001, 1024 * 1024 + 1, 1234ULL}}; +typedef ternaryOpTest ternaryOpTestD; +TEST_P(ternaryOpTestD, Result) { - ASSERT_TRUE(devArrMatch( - out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch( - out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestD, ::testing::ValuesIn(inputsd)); From 4909d2c1a005d169e55488a087c38245c67a23a5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Feb 2022 19:16:07 -0500 Subject: [PATCH 03/24] Updarting style --- cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh | 5 +++-- cpp/include/raft/linalg/power.cuh | 1 + cpp/test/linalg/power.cu | 1 + cpp/test/linalg/sqrt.cu | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index c88895807d..680c95f1f4 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -301,8 +301,9 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT // same for the whole block sh_key_inx++; #else - if (d_keys[r] != global_key) continue; // No divergence since global_key is the - // same for the whole block + if (d_keys[r] != global_key) + continue; // No divergence since global_key is the + // same for the whole block #endif // if ((end_row-start_row) / (r-start_row) != global_key) continue; DataType val = __ldcg(&d_A[r * lda + this_col]); diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index 07760f0c5c..d17fa9a043 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include namespace raft { diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu index 13d9791992..d3c76e1049 100644 --- a/cpp/test/linalg/power.cu +++ b/cpp/test/linalg/power.cu @@ -20,6 +20,7 @@ #include #include +namespace raft { namespace linalg { template diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu index f604a8a1ef..27fa0f8959 100644 --- a/cpp/test/linalg/sqrt.cu +++ b/cpp/test/linalg/sqrt.cu @@ -16,8 +16,8 @@ #include "test_utils.h" #include -#include #include +#include #include namespace raft { From df48d3405f86feea9ac405aeb6a2ededfb045b10 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Feb 2022 19:29:51 -0500 Subject: [PATCH 04/24] Fixing include for test utils --- cpp/test/linalg/power.cu | 2 +- cpp/test/linalg/reduce_cols_by_key.cu | 2 +- cpp/test/linalg/reduce_rows_by_key.cu | 2 +- cpp/test/linalg/rsvd.cu | 2 +- cpp/test/linalg/sqrt.cu | 2 +- cpp/test/linalg/ternary_op.cu | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu index d3c76e1049..8c862bbeab 100644 --- a/cpp/test/linalg/power.cu +++ b/cpp/test/linalg/power.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu index 072f586bac..94459769f8 100644 --- a/cpp/test/linalg/reduce_cols_by_key.cu +++ b/cpp/test/linalg/reduce_cols_by_key.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu index 1bda427e6f..9219c4f561 100644 --- a/cpp/test/linalg/reduce_rows_by_key.cu +++ b/cpp/test/linalg/reduce_rows_by_key.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu index 7a315ddde6..b8e44580b5 100644 --- a/cpp/test/linalg/rsvd.cu +++ b/cpp/test/linalg/rsvd.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu index 27fa0f8959..6aa6376c26 100644 --- a/cpp/test/linalg/sqrt.cu +++ b/cpp/test/linalg/sqrt.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu index 168b0cd31d..4140a9c4b3 100644 --- a/cpp/test/linalg/ternary_op.cu +++ b/cpp/test/linalg/ternary_op.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include From 3c2fc7e31c222978378b23a18fd8fe369822285d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Feb 2022 19:34:44 -0500 Subject: [PATCH 05/24] Updating lstsq --- cpp/include/raft/linalg/detail/lstsq.hpp | 61 +++++++++++++----------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp index c91d6e41c1..6553394cc4 100644 --- a/cpp/include/raft/linalg/detail/lstsq.hpp +++ b/cpp/include/raft/linalg/detail/lstsq.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -50,7 +51,7 @@ struct DeviceEvent { DeviceEvent(bool concurrent) { if (concurrent) - RAFT_CUDA_TRY(cudaEventCreate(&e)); + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); else e = nullptr; } @@ -60,23 +61,16 @@ struct DeviceEvent { if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e)); } - operator cudaEvent_t() const { return e; } - void record(cudaStream_t stream) { if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream)); } - void wait(cudaStream_t stream) + void wait_by(cudaStream_t stream) { if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u)); } - void wait() - { - if (e != nullptr) raft::interruptible::synchronize(e); - } - DeviceEvent& operator=(const DeviceEvent& other) = delete; }; @@ -265,27 +259,26 @@ void lstsqEig(const raft::handle_t& handle, cudaStream_t stream) { rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); - rmm::cuda_stream_view multAbStream = mainStream; - bool concurrent = false; - { - int sp_size = 
handle.get_stream_pool_size(); - if (sp_size > 0) { - multAbStream = handle.get_stream_from_stream_pool(0); - // check if the two streams can run concurrently - if (!are_implicitly_synchronized(mainStream, multAbStream)) { - concurrent = true; - } else if (sp_size > 1) { - mainStream = multAbStream; - multAbStream = handle.get_stream_from_stream_pool(1); - concurrent = true; - } - } + rmm::cuda_stream_view multAbStream = handle.get_next_usable_stream(); + bool concurrent; + // Check if the two streams can run concurrently. This is needed because a legacy default stream + // would synchronize with other blocking streams. To avoid synchronization in such case, we try to + // use an additional stream from the pool. + if (!are_implicitly_synchronized(mainStream, multAbStream)) { + concurrent = true; + } else if (handle.get_stream_pool_size() > 1) { + mainStream = handle.get_next_usable_stream(); + concurrent = true; + } else { + multAbStream = mainStream; + concurrent = false; } - // the event is created only if the given raft handle is capable of running - // at least two CUDA streams without implicit synchronization. - DeviceEvent multAbDone(concurrent); rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); + // the event is created only if the given raft handle is capable of running + // at least two CUDA streams without implicit synchronization. 
+ DeviceEvent worksetDone(concurrent); + worksetDone.record(mainStream); math_t* Q = workset.data(); math_t* QS = Q + n_cols * n_cols; math_t* covA = QS + n_cols * n_cols; @@ -310,7 +303,9 @@ void lstsqEig(const raft::handle_t& handle, mainStream); // Ab <- A* b + worksetDone.wait_by(multAbStream); raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream); + DeviceEvent multAbDone(concurrent); multAbDone.record(multAbStream); // Q S Q* <- covA @@ -335,9 +330,18 @@ void lstsqEig(const raft::handle_t& handle, alpha, beta, mainStream); - multAbDone.wait(mainStream); + + multAbDone.wait_by(mainStream); // w <- covA Ab == Q invS Q* A b == inv(A* A) A b raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream); + + // This event is created only if we use two worker streams, and `stream` is not the legacy stream, + // and `mainStream` is not a non-blocking stream. In fact, with the current logic these conditions + // are impossible together, but it still makes sense to put this construct here to emphasize that + // `stream` must wait till the work here is done (for future refactorings). + DeviceEvent mainDone(!are_implicitly_synchronized(mainStream, stream)); + mainDone.record(mainStream); + mainDone.wait_by(stream); } /** Solves the linear ordinary least squares problem `Aw = b` @@ -448,7 +452,6 @@ void lstsqQR(const raft::handle_t& handle, RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream)); } - }; // namespace detail }; // namespace linalg }; // namespace raft From 0b5ba541ed0f5ee202d6faa237839f73d9e8bdfa Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Feb 2022 20:22:55 -0500 Subject: [PATCH 06/24] Adding missing reduction test --- cpp/test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index c0db20f650..2ace88b498 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -59,6 +59,7 @@ add_executable(test_raft test/linalg/power.cu test/linalg/reduce.cu test/linalg/reduce_cols_by_key.cu + test/linalg/reduce_rows_by_key.cu test/linalg/rsvd.cu test/linalg/sqrt.cu test/linalg/strided_reduction.cu From 1b613b2c974845e03fa3b2f26f4389da898f6d1a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Feb 2022 20:41:51 -0500 Subject: [PATCH 07/24] Fixing lstsq --- cpp/include/raft/linalg/lstsq.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index 5540cca3a5..cdf67e422d 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft { namespace linalg { From c7f059fb6f0646edd7dce5768c391412fb6e4db7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Feb 2022 21:42:10 -0500 Subject: [PATCH 08/24] Typo --- cpp/include/raft/linalg/reduce_rows_by_key.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 986f5e8a7f..b97a25dbac 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -95,7 +95,5 @@ void reduce_rows_by_key(const DataIteratorT d_A, stream); } -}; // end namespace detail }; // end namespace linalg -} -; // end namespace raft +}; // end namespace raft From 3f207c59975f377a4e92bd93f59b4ec75e9a9926 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Feb 2022 22:24:17 -0500 Subject: [PATCH 09/24] Exposing convert_array --- cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh | 5 +++++ cpp/include/raft/linalg/reduce_rows_by_key.cuh | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index 680c95f1f4..aa0b1545d3 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -40,6 +40,11 @@ void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) } } +// +// Small helper function to convert from int->char and char->int +// Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars +// + template void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) { diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index b97a25dbac..5b34f5a6ec 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -20,6 +20,17 @@ namespace raft { namespace linalg { + +/** + Small helper function to convert from int->char and char->int + Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars +**/ +template +void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) +{ + detail::convert_array(dst, src, st); +} + /** * @brief Computes the weighted reduction of matrix rows for each given key * From 34f7bf3a33b5e8d9764304c6e170a9416024cae6 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Feb 2022 23:09:36 -0500 Subject: [PATCH 10/24] Oops --- cpp/include/raft/linalg/reduce_rows_by_key.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 5b34f5a6ec..76d4ed4971 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -28,7 +28,7 @@ namespace linalg { template void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) { - detail::convert_array(dst, src, st); + detail::convert_array(dst, src, n, st); } /** From 68639a296836430106014b06c1cb025c927a48a0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 09:59:30 -0500 Subject: [PATCH 11/24] Adding proper doxygen docs to lstsq --- cpp/include/raft/linalg/lstsq.hpp | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index cdf67e422d..57dd0a7b15 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -24,8 +24,15 @@ namespace linalg { /** Solves the linear ordinary least squares problem `Aw = b` * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. * - * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, - * so it's not guaranteed to stay unmodified. + * @param[in] handle raft handle + * @param[inout] A input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. 
+ * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations */ template void lstsqSvdQR(const raft::handle_t& handle, @@ -42,8 +49,15 @@ void lstsqSvdQR(const raft::handle_t& handle, /** Solves the linear ordinary least squares problem `Aw = b` * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). * - * @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines, - * so it's not guaranteed to stay unmodified. + * @param[in] handle raft handle + * @param[inout] A input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. + * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations */ template void lstsqSvdJacobi(const raft::handle_t& handle, @@ -77,10 +91,15 @@ void lstsqEig(const raft::handle_t& handle, * via QR decomposition of `A = QR`. * (triangular system of equations `Rw = Q^T b`) * - * @param A[in/out] - input feature matrix. + * @param[in] handle raft handle + * @param[inout] A input feature matrix. * Warning: the content of this matrix is modified by the cuSOLVER routines. - * @param b[in/out] - input target vector. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. * Warning: the content of this vector is modified by the cuSOLVER routines. + * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations */ template void lstsqQR(const raft::handle_t& handle, From 34b54f44608eae572a88c974f1dad5a7d94b5240 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Feb 2022 16:35:08 -0500 Subject: [PATCH 12/24] Updating gtests --- cpp/test/linalg/power.cu | 16 ++++++++++++---- cpp/test/linalg/sqrt.cu | 13 ++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu index 8c862bbeab..0ec8613ce7 100644 --- a/cpp/test/linalg/power.cu +++ b/cpp/test/linalg/power.cu @@ -71,14 +71,21 @@ template template class PowerTest : public ::testing::TestWithParam> { protected: - PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {} + PowerTest() + : in1(0, handle.get_stream()), + in2(0, handle.get_stream()), + out_ref(0, handle.get_stream()), + out(0, handle.get_stream()) + { + } void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + cudaStream_t stream = handle.get_stream(); in1.resize(len, stream); in2.resize(len, stream); @@ -94,11 +101,12 @@ class PowerTest : public ::testing::TestWithParam> { powerScalar(out.data(), out.data(), T(2), len, stream); power(in1.data(), in1.data(), in2.data(), len, stream); powerScalar(in1.data(), in1.data(), T(2), len, stream); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + + handle.sync_stream(); } protected: - cudaStream_t stream = 0; + raft::handle_t handle; PowerInputs params; rmm::device_uvector in1, in2, out_ref, out; int device_count = 0; diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu index 6aa6376c26..92c9626395 100644 --- a/cpp/test/linalg/sqrt.cu +++ b/cpp/test/linalg/sqrt.cu @@ -55,13 +55,16 @@ template template class SqrtTest : public ::testing::TestWithParam> { protected: - SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {} + SqrtTest() + : in1(0, handle.get_stream()), out_ref(0, handle.get_stream()), out(0, handle.get_stream()) + { + } void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + auto stream = 
handle.get_stream(); + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); int len = params.len; in1.resize(len, stream); out_ref.resize(len, stream); @@ -72,11 +75,11 @@ class SqrtTest : public ::testing::TestWithParam> { sqrt(out.data(), in1.data(), len, stream); sqrt(in1.data(), in1.data(), len, stream); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } protected: - cudaStream_t stream = 0; + raft::handle_t handle; SqrtInputs params; rmm::device_uvector in1, out_ref, out; int device_count = 0; From 587c0f10e48ffde39e89934370208aa4bbc2993d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 17:04:38 -0500 Subject: [PATCH 13/24] Moving remaining stats stuff over --- cpp/include/raft/common/seive.cuh | 125 +++++ cpp/include/raft/stats/cov.hpp | 58 ++ cpp/include/raft/stats/detail/cov.cuh | 95 ++++ cpp/include/raft/stats/detail/histogram.cuh | 529 ++++++++++++++++++ cpp/include/raft/stats/detail/minmax.cuh | 247 ++++++++ .../raft/stats/detail/weighted_mean.cuh | 94 ++++ cpp/include/raft/stats/histogram.hpp | 60 ++ cpp/include/raft/stats/minmax.hpp | 70 +++ cpp/include/raft/stats/weighted_mean.hpp | 60 ++ cpp/test/CMakeLists.txt | 5 + cpp/test/common/seive.cu | 35 ++ cpp/test/stats/cov.cu | 185 ++++++ cpp/test/stats/histogram.cu | 262 +++++++++ cpp/test/stats/minmax.cu | 202 +++++++ cpp/test/stats/weighted_mean.cu | 231 ++++++++ 15 files changed, 2258 insertions(+) create mode 100644 cpp/include/raft/common/seive.cuh create mode 100644 cpp/include/raft/stats/cov.hpp create mode 100644 cpp/include/raft/stats/detail/cov.cuh create mode 100644 cpp/include/raft/stats/detail/histogram.cuh create mode 100644 cpp/include/raft/stats/detail/minmax.cuh create mode 100644 cpp/include/raft/stats/detail/weighted_mean.cuh create mode 100644 cpp/include/raft/stats/histogram.hpp create mode 100644 cpp/include/raft/stats/minmax.hpp create mode 
100644 cpp/include/raft/stats/weighted_mean.hpp create mode 100644 cpp/test/common/seive.cu create mode 100644 cpp/test/stats/cov.cu create mode 100644 cpp/test/stats/histogram.cu create mode 100644 cpp/test/stats/minmax.cu create mode 100644 cpp/test/stats/weighted_mean.cu diff --git a/cpp/include/raft/common/seive.cuh b/cpp/include/raft/common/seive.cuh new file mode 100644 index 0000000000..580d9d91cb --- /dev/null +++ b/cpp/include/raft/common/seive.cuh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +// Taken from: +// https://github.com/teju85/programming/blob/master/euler/include/seive.h + +namespace raft { +namespace common { + +/** + * @brief Implementation of 'Sieve of Eratosthenes' + */ +class Seive { + public: + /** + * @param _num largest integer (inclusive) up to which primes are sieved + */ + Seive(unsigned _num) + { + N = _num; + generateSeive(); + } + + /** + * @brief Check whether a number is prime or not + * @param num number to be checked + * @return true if 'num' is prime, else false (odd numbers above N were never sieved: false) + */ + bool isPrime(unsigned num) const + { + unsigned mask, pos; + if (num <= 1) { return false; } + if (num == 2) { return true; } + if (!(num & 1) || num > N) { return false; } // even, or outside the sieved bit table + getMaskPos(num, mask, pos); + return (seive[pos] & mask); + } + + private: + void generateSeive() + { + auto sqN = fastIntSqrt(N); + auto size = raft::ceildiv(N, sizeof(unsigned) * 8); + seive.resize(size); + // assume all to be primes initially + for (auto& itr : seive) { + itr = 0xffffffffu; + } + unsigned cid = 0; + unsigned cnum = getNum(cid); + while (cnum <= sqN) { + do { + ++cid; + cnum = getNum(cid); + if (isPrime(cnum)) { break; } + } while (cnum <= sqN); + auto cnum2 = cnum << 1; + // 'unmark' all the 'odd' multiples of the current prime + for (unsigned i = 3, num = i * cnum; num <= N; i += 2, num += cnum2) { + unmark(num); + } + } + } + + unsigned getId(unsigned num) const { return (num >> 1); } + + unsigned getNum(unsigned id) const + { + if (id == 0) { return 2; } + return ((id << 1) + 1); + } + + void getMaskPos(unsigned num, unsigned& mask, unsigned& pos) const + { + pos = getId(num); // one bit per odd number: bit index = num / 2 + mask = 1u << (pos & 0x1f); // unsigned literal so that bit 31 is not a signed shift + pos >>= 5; // 32 bits per word + } + + void unmark(unsigned num) + { + unsigned mask, pos; + getMaskPos(num, mask, pos); + seive[pos] &= ~mask; + } + + // REF: http://www.azillionmonkeys.com/qed/ulerysqroot.pdf + unsigned fastIntSqrt(unsigned val) + { + unsigned g = 0; + auto bshft = 15u, b = 1u << bshft; + do { + unsigned temp = ((g << 1) + b) << bshft--; +
if (val >= temp) { + g += b; + val -= temp; + } + } while (b >>= 1); + return g; + } + + /** find all primes till this number */ + unsigned N; + /** the seive */ + std::vector<unsigned> seive; +}; +}; // namespace common +}; // namespace raft diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp new file mode 100644 index 0000000000..dc5bc63ee8 --- /dev/null +++ b/cpp/include/raft/stats/cov.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace raft { +namespace stats { +/** + * @brief Compute covariance of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @param covar the output covariance matrix + * @param data the input matrix (this will get mean-centered at the end!) + * @param mu mean vector of the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample covariance or not. In other words, + * whether to normalize the output using N-1 or N, for true or false, + * respectively + * @param rowMajor whether the input data is row or col major + * @param stable whether to run the slower-but-numerically-stable version or not + * @param handle the raft handle + * @param stream cuda stream + * @note if stable=true, then the input data will be mean centered after this + * function returns!
+ */ +template +void cov(const raft::handle_t& handle, + Type* covar, + Type* data, + const Type* mu, + std::size_t D, + std::size_t N, + bool sample, + bool rowMajor, + bool stable, + cudaStream_t stream) +{ + detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream); +} +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh new file mode 100644 index 0000000000..7e3fc701a1 --- /dev/null +++ b/cpp/include/raft/stats/detail/cov.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace stats { +namespace detail { +/** + * @brief Compute covariance of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @param covar the output covariance matrix + * @param data the input matrix (this will get mean-centered at the end!) + * @param mu mean vector of the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample covariance or not.
In other words, + * whether to normalize the output using N-1 or N, for true or false, + * respectively + * @param rowMajor whether the input data is row or col major + * @param stable whether to run the slower-but-numerically-stable version or not (only stable=true is implemented; stable=false trips the ASSERT below) + * @param handle raft handle; its cuBLAS handle is used for the row-major GEMM + * @param stream cuda stream + * @note if stable=true, then the input data will be mean centered after this + * function returns! + */ +template +void cov(const raft::handle_t& handle, + Type* covar, + Type* data, + const Type* mu, + std::size_t D, + std::size_t N, + bool sample, + bool rowMajor, + bool stable, + cudaStream_t stream) +{ + if (stable) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + // since mean operation is assumed to be along a given column, broadcast + // must be along rows! + raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream); + Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N)); // NOTE(review): N is unsigned, so sample=true with N=0 wraps and N=1 divides by zero; confirm callers pass N >= 2 + Type beta = Type(0); + if (rowMajor) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_T, + D, + D, + N, + &alpha, + data, + D, + data, + D, + &beta, + covar, + D, + stream)); + } else { + raft::linalg::gemm( + handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); + } + } else { + ///@todo: implement this using cutlass + customized epilogue! 
+ ASSERT(false, "cov: Implement stable=false case!"); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh new file mode 100644 index 0000000000..8c69ba1459 --- /dev/null +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { +namespace detail { + +/** Default mapper which just returns the value of the data itself */ +template +struct IdentityBinner { + DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } +}; + +/** Types of supported histogram implementations */ +enum HistType { + /** shared mem atomics but with bins to be 1b int's */ + HistTypeSmemBits1 = 1, + /** shared mem atomics but with bins to be 2b int's */ + HistTypeSmemBits2 = 2, + /** shared mem atomics but with bins to be 4b int's */ + HistTypeSmemBits4 = 4, + /** shared mem atomics but with bins to be 1B int's */ + HistTypeSmemBits8 = 8, + /** shared mem atomics but with bins to be 2B int's */ + HistTypeSmemBits16 = 16, + /** use only global atomics */ + HistTypeGmem, + /** uses shared mem atomics to reduce global traffic */ + HistTypeSmem, + /** + * uses shared mem atomics with match_any intrinsic to further reduce shared + * memory traffic. This can only be enabled on Volta and later architectures. + * If one tries to enable this for older arch's, it will fall back to + * `HistTypeSmem`.
+ * @note This is to be used only when the input dataset leads to a lot of + * repetitions in a given warp, else, this algo can be much slower than + * `HistTypeSmem`! + */ + HistTypeSmemMatchAny, + /** builds a hashmap of active bins in shared mem */ + HistTypeSmemHash, + /** decide at runtime the best algo for the given inputs */ + HistTypeAuto +}; + +static const int ThreadsPerBlock = 256; + +template +dim3 computeGridDim(IdxT nrows, IdxT ncols, const void* kernel) +{ + int occupancy; + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, ThreadsPerBlock, 0)); + const auto maxBlks = occupancy * raft::getMultiProcessorCount(); + int nblksx = raft::ceildiv(VecLen ? nrows / VecLen : nrows, ThreadsPerBlock); + // for cases when there aren't a lot of blocks for computing one histogram + nblksx = std::min(nblksx, maxBlks); + return dim3(nblksx, ncols); +} + +template +DI void histCoreOp(const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner, CoreOp op, IdxT col) +{ + IdxT offset = col * nrows; + auto bdim = IdxT(blockDim.x); + IdxT tid = threadIdx.x + bdim * blockIdx.x; + tid *= VecLen; + IdxT stride = bdim * gridDim.x * VecLen; + int nCeil = raft::alignTo(nrows, stride); // NOTE(review): held in an int while nrows/stride/i are IdxT; confirm nrows cannot exceed INT_MAX + typedef raft::TxN_t VecType; + VecType a; + for (auto i = tid; i < nCeil; i += stride) { + if (i < nrows) { a.load(data, offset + i); } +#pragma unroll + for (int j = 0; j < VecLen; ++j) { + int binId = binner(a.val.data[j], i + j, col); + op(binId, i + j, col); + } + } +} + +template +__global__ void gmemHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; + auto binOffset = col * nbins; +#if __CUDA_ARCH__ < 700 + raft::myAtomicAdd(bins + binOffset + binId, 1); +#else + auto amask = __activemask(); + auto mask = __match_any_sync(amask, binId); + auto 
leader = __ffs(mask) - 1; + if (raft::laneId() == leader) { raft::myAtomicAdd(bins + binOffset
+ binId, __popc(mask)); } +#endif // __CUDA_ARCH__ + }; + histCoreOp(data, nrows, nbins, binner, op, blockIdx.y); +} + +template +void gmemHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + auto blks = computeGridDim( + nrows, ncols, (const void*)gmemHistKernel); + gmemHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +template +__global__ void smemHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + extern __shared__ unsigned sbins[]; + for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { + sbins[i] = 0; + } + __syncthreads(); + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; +#if __CUDA_ARCH__ < 700 + raft::myAtomicAdd(sbins + binId, 1); +#else + if (UseMatchAny) { + auto amask = __activemask(); + auto mask = __match_any_sync(amask, binId); + auto leader = __ffs(mask) - 1; + if (raft::laneId() == leader) { + raft::myAtomicAdd(sbins + binId, __popc(mask)); + } + } else { + raft::myAtomicAdd(sbins + binId, 1); + } +#endif // __CUDA_ARCH__ + }; + IdxT col = blockIdx.y; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + auto binOffset = col * nbins; + for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { + auto val = sbins[i]; + if (val > 0) { raft::myAtomicAdd((unsigned int*)bins + binOffset + i, val); } + } +} + +template +void smemHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + auto blks = computeGridDim( + nrows, ncols, (const void*)smemHistKernel); + size_t smemSize = nbins * sizeof(unsigned); + smemHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +template +struct BitsInfo { + static unsigned const BIN_BITS = _BIN_BITS; + static unsigned const WORD_BITS = sizeof(unsigned) * 8; + static unsigned const WORD_BINS = WORD_BITS / BIN_BITS; // bins packed into one unsigned word + static unsigned const BIN_MASK 
= (1 << BIN_BITS) - 1; +};
+ +template +DI void incrementBin(unsigned* sbins, int* bins, int nbins, int binId) +{ + typedef BitsInfo Bits; + auto iword = binId / Bits::WORD_BINS; + auto ibin = binId % Bits::WORD_BINS; + auto sh = ibin * Bits::BIN_BITS; + auto old_word = atomicAdd(sbins + iword, unsigned(1 << sh)); + auto new_word = old_word + unsigned(1 << sh); + if ((new_word >> sh & Bits::BIN_MASK) != 0) return; + // overflow + raft::myAtomicAdd((unsigned int*)bins + binId, Bits::BIN_MASK + 1); + for (int dbin = 1; ibin + dbin < Bits::WORD_BINS && binId + dbin < nbins; ++dbin) { + auto sh1 = (ibin + dbin) * Bits::BIN_BITS; + if ((new_word >> sh1 & Bits::BIN_MASK) == 0) { + // overflow + raft::myAtomicAdd((unsigned int*)bins + binId + dbin, Bits::BIN_MASK); + } else { + // correction + raft::myAtomicAdd(bins + binId + dbin, -1); + break; + } + } +} + +template <> +DI void incrementBin<1>(unsigned* sbins, int* bins, int nbins, int binId) +{ + typedef BitsInfo<1> Bits; + auto iword = binId / Bits::WORD_BITS; + auto sh = binId % Bits::WORD_BITS; + auto old_word = atomicXor(sbins + iword, unsigned(1 << sh)); + if ((old_word >> sh & 1) != 0) raft::myAtomicAdd(bins + binId, 2); +} + +template +__global__ void smemBitsHistKernel( + int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) +{ + extern __shared__ unsigned sbins[]; + typedef BitsInfo Bits; + auto nwords = raft::ceildiv(nbins, Bits::WORD_BINS); + for (auto j = threadIdx.x; j < nwords; j += blockDim.x) { + sbins[j] = 0; + } + __syncthreads(); + IdxT col = blockIdx.y; + IdxT binOffset = col * nbins; + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + if (row >= nrows) return; + incrementBin(sbins, bins + binOffset, (int)nbins, binId); + }; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + for (auto j = threadIdx.x; j < (int)nbins; j += blockDim.x) { + auto shift = j % Bits::WORD_BINS * Bits::BIN_BITS; + int count = sbins[j / Bits::WORD_BINS] >> shift & Bits::BIN_MASK; + if (count > 0) 
raft::myAtomicAdd(bins + binOffset + j, count); + } +} + +template +void smemBitsHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + typedef BitsInfo Bits; + auto blks = computeGridDim( + nrows, ncols, (const void*)smemBitsHistKernel); + size_t smemSize = raft::ceildiv(nbins, Bits::WORD_BITS / Bits::BIN_BITS) * sizeof(int); + smemBitsHistKernel + <<>>(bins, data, nrows, nbins, binner); +} + +#define INVALID_KEY -1 + +DI void clearHashTable(int2* ht, int hashSize) +{ + for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { + ht[i] = {INVALID_KEY, 0}; + } +} + +DI int findEntry(int2* ht, int hashSize, int binId, int threshold) +{ + int idx = binId % hashSize; + int t; + int count = 0; + while ((t = atomicCAS(&(ht[idx].x), INVALID_KEY, binId)) != INVALID_KEY && t != binId) { + ++count; + if (count >= threshold) { + idx = INVALID_KEY; + break; + } + ++idx; + if (idx >= hashSize) { idx = 0; } + } + return idx; +} + +DI void flushHashTable(int2* ht, int hashSize, int* bins, int nbins, int col) +{ + int binOffset = col * nbins; + for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { + if (ht[i].x != INVALID_KEY && ht[i].y > 0) { + raft::myAtomicAdd(bins + binOffset + ht[i].x, ht[i].y); + } + ht[i] = {INVALID_KEY, 0}; + } +} + +#undef INVALID_KEY + +///@todo: honor VecLen template param +template +__global__ void smemHashHistKernel(int* bins, + const DataT* data, + IdxT nrows, + IdxT nbins, + BinnerOp binner, + int hashSize, + int threshold) +{ + extern __shared__ int2 ht[]; + int* needFlush = (int*)&(ht[hashSize]); + if (threadIdx.x == 0) { needFlush[0] = 0; } + clearHashTable(ht, hashSize); + __syncthreads(); + auto op = [=] __device__(int binId, IdxT row, IdxT col) { + bool iNeedFlush = false; + if (row < nrows) { + int hidx = findEntry(ht, hashSize, binId, threshold); + if (hidx >= 0) { + raft::myAtomicAdd(&(ht[hidx].y), 1); + } else { + needFlush[0] = 1; + iNeedFlush = true; + } + } + 
__syncthreads(); + if (needFlush[0]) { + flushHashTable(ht, hashSize, bins, nbins, col); + __syncthreads(); + if (threadIdx.x == 0) { needFlush[0] = 0; } + __syncthreads(); + } + if (iNeedFlush) { + int hidx = findEntry(ht, hashSize, binId, threshold); + // all threads are bound to get one valid entry as all threads in this + // block will make forward progress due to the __syncthreads call in the + // subsequent iteration + raft::myAtomicAdd(&(ht[hidx].y), 1); + } + }; + IdxT col = blockIdx.y; + histCoreOp(data, nrows, nbins, binner, op, col); + __syncthreads(); + flushHashTable(ht, hashSize, bins, nbins, col); +} + +inline int computeHashTableSize() +{ + // we shouldn't have this much of shared memory available anytime soon! + static const unsigned maxBinsEverPossible = 256 * 1024; + static raft::common::Seive primes(maxBinsEverPossible); + unsigned smem = raft::getSharedMemPerBlock(); + // divide-by-2 because hash table entry stores 2 elements: idx and count + auto binsPossible = smem / sizeof(unsigned) / 2; + for (; binsPossible > 1; --binsPossible) { + if (primes.isPrime(binsPossible)) return (int)binsPossible; + } + return 1; // should not happen! 
+} + +template +void smemHashHist(int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + BinnerOp binner, + cudaStream_t stream) +{ + static const int flushThreshold = 10; + auto blks = computeGridDim( + nrows, ncols, (const void*)smemHashHistKernel); + int hashSize = computeHashTableSize(); + size_t smemSize = hashSize * sizeof(int2) + sizeof(int); + smemHashHistKernel<<>>( + bins, data, nrows, nbins, binner, hashSize, flushThreshold); +} + +template +void histogramVecLen(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner) +{ + RAFT_CUDA_TRY(cudaMemsetAsync(bins, 0, ncols * nbins * sizeof(int), stream)); + switch (type) { + case HistTypeGmem: + gmemHist(bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmem: + smemHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemMatchAny: + smemHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits16: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits8: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits4: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits2: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemBits1: + smemBitsHist( + bins, nbins, data, nrows, ncols, binner, stream); + break; + case HistTypeSmemHash: + smemHashHist(bins, nbins, data, nrows, ncols, binner, stream); + break; + default: ASSERT(false, "histogram: Invalid type passed '%d'!", type); + }; + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void histogramImpl(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner) +{ + size_t bytes = nrows * sizeof(DataT); + if (nrows <= 0) return; + if (16 % 
sizeof(DataT) == 0 && bytes % 16 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (4 % sizeof(DataT) == 0 && bytes % 4 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else if (2 % sizeof(DataT) == 0 && bytes % 2 == 0) { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } else { + histogramVecLen( + type, bins, nbins, data, nrows, ncols, stream, binner); + } +} + +template +HistType selectBestHistAlgo(IdxT nbins) +{ + size_t smem = raft::getSharedMemPerBlock(); + size_t requiredSize = nbins * sizeof(unsigned); + if (requiredSize <= smem) { return HistTypeSmem; } + for (int bits = 16; bits >= 1; bits >>= 1) { + auto nBytesForBins = raft::ceildiv(bits * nbins, 8); + requiredSize = raft::alignTo(nBytesForBins, sizeof(unsigned)); + if (requiredSize <= smem) { return static_cast(bits); } + } + return HistTypeGmem; +} + +/** + * @brief Perform histogram on the input data. It chooses the right load size + * based on the input data vector length. It also supports large-bin cases + * using a specialized smem-based hashing technique. 
+ * @tparam DataT input data type + * @tparam IdxT data type used to compute indices + * @tparam BinnerOp takes the input data and computes its bin index + * @param type histogram implementation type to choose + * @param bins the output bins (length = ncols * nbins) + * @param nbins number of bins + * @param data input data (length = ncols * nrows) + * @param nrows data array length in each column (or batch) + * @param ncols number of columsn (or batch size) + * @param stream cuda stream + * @param binner the operation that computes the bin index of the input data + * + * @note signature of BinnerOp is `int func(DataT, IdxT);` + */ +template > +void histogram(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner = IdentityBinner()) +{ + HistType computedType = type; + if (type == HistTypeAuto) { computedType = selectBestHistAlgo(nbins); } + histogramImpl( + computedType, bins, nbins, data, nrows, ncols, stream, binner); +} + +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh new file mode 100644 index 0000000000..c2b14f1544 --- /dev/null +++ b/cpp/include/raft/stats/detail/minmax.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace raft { +namespace stats { + +namespace detail { + +// TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it +template +constexpr To + +bit_cast(const From& from) + + noexcept +{ + To to{}; + static_assert(sizeof(To) == sizeof(From)); + memcpy(&to, &from, sizeof(To)); + return to; +} + +} // namespace detail + +template +struct encode_traits { +}; + +template <> +struct encode_traits { + using E = int; +}; + +template <> +struct encode_traits { + using E = long long; +}; + +HDI int encode(float val) +{ + int i = detail::bit_cast(val); + return i >= 0 ? i : (1 << 31) | ~i; +} + +HDI long long encode(double val) +{ + std::int64_t i = detail::bit_cast(val); + return i >= 0 ? i : (1ULL << 63) | ~i; +} + +HDI float decode(int val) +{ + if (val < 0) val = (1 << 31) | ~val; + return detail::bit_cast(val); +} + +HDI double decode(long long val) +{ + if (val < 0) val = (1ULL << 63) | ~val; + return detail::bit_cast(val); +} + +template +DI T atomicMaxBits(T* address, T val) +{ + E old = atomicMax((E*)address, encode(val)); + return decode(old); +} + +template +DI T atomicMinBits(T* address, T val) +{ + E old = atomicMin((E*)address, encode(val)); + return decode(old); +} + +template +__global__ void decodeKernel(T* globalmin, T* globalmax, int ncols) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < ncols) { + globalmin[tid] = decode(*(E*)&globalmin[tid]); + globalmax[tid] = decode(*(E*)&globalmax[tid]); + } +} + +///@todo: implement a proper "fill" kernel +template +__global__ void minmaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= ncols) return; + *(E*)&globalmin[tid] = encode(init_val); + *(E*)&globalmax[tid] = encode(-init_val); +} + +template +__global__ void minmaxKernel(const T* data, + const unsigned int* rowids, + const unsigned int* colids, + int nrows, + int ncols, + int 
row_stride, + T* g_min, + T* g_max, + T* sampledcols, + T init_min_val, + int batch_ncols, + int num_batches) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ char shmem[]; + T* s_min = (T*)shmem; + T* s_max = (T*)(shmem + sizeof(T) * batch_ncols); + + int last_batch_ncols = ncols % batch_ncols; + if (last_batch_ncols == 0) { last_batch_ncols = batch_ncols; } + int orig_batch_ncols = batch_ncols; + + for (int batch_id = 0; batch_id < num_batches; batch_id++) { + if (batch_id == num_batches - 1) { batch_ncols = last_batch_ncols; } + + for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) { + *(E*)&s_min[i] = encode(init_min_val); + *(E*)&s_max[i] = encode(-init_min_val); + } + __syncthreads(); + + for (int i = tid; i < nrows * batch_ncols; i += blockDim.x * gridDim.x) { + int col = (batch_id * orig_batch_ncols) + (i / nrows); + int row = i % nrows; + if (colids != nullptr) { col = colids[col]; } + if (rowids != nullptr) { row = rowids[row]; } + int index = row + col * row_stride; + T coldata = data[index]; + if (!isnan(coldata)) { + // Min max values are saved in shared memory and global memory as per the shuffled colids. 
+ atomicMinBits(&s_min[(int)(i / nrows)], coldata); + atomicMaxBits(&s_max[(int)(i / nrows)], coldata); + } + if (sampledcols != nullptr) { sampledcols[batch_id * orig_batch_ncols + i] = coldata; } + } + __syncthreads(); + + // finally, perform global mem atomics + for (int j = threadIdx.x; j < batch_ncols; j += blockDim.x) { + atomicMinBits(&g_min[batch_id * orig_batch_ncols + j], decode(*(E*)&s_min[j])); + atomicMaxBits(&g_max[batch_id * orig_batch_ncols + j], decode(*(E*)&s_max[j])); + } + __syncthreads(); + } +} + +/** + * @brief Computes min/max across every column of the input matrix, as well as + * optionally allow to subsample based on the given row/col ID mapping vectors + * + * @tparam T the data type + * @tparam TPB number of threads per block + * @param data input data + * @param rowids actual row ID mappings. It is of length nrows. If you want to + * skip this index lookup entirely, pass nullptr + * @param colids actual col ID mappings. It is of length ncols. If you want to + * skip this index lookup entirely, pass nullptr + * @param nrows number of rows of data to be worked upon. The actual rows of the + * input "data" can be bigger than this! + * @param ncols number of cols of data to be worked upon. The actual cols of the + * input "data" can be bigger than this! + * @param row_stride stride (in number of elements) between 2 adjacent columns + * @param globalmin final col-wise global minimum (size = ncols) + * @param globalmax final col-wise global maximum (size = ncols) + * @param sampledcols output sampled data. Pass nullptr if you don't need this + * @param stream cuda stream + * @note This method makes the following assumptions: + * 1. input and output matrices are assumed to be col-major + * 2. 
ncols is small enough to fit the whole of min/max values across all cols + * in shared memory + */ +template +void minmax(const T* data, + const unsigned* rowids, + const unsigned* colids, + int nrows, + int ncols, + int row_stride, + T* globalmin, + T* globalmax, + T* sampledcols, + cudaStream_t stream) +{ + using E = typename encode_traits::E; + int nblks = raft::ceildiv(ncols, TPB); + T init_val = std::numeric_limits::max(); + minmaxInitKernel<<>>(ncols, globalmin, globalmax, init_val); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + nblks = raft::ceildiv(nrows * ncols, TPB); + nblks = min(nblks, 65536); + size_t smemSize = sizeof(T) * 2 * ncols; + + // Compute the batch_ncols, in [1, ncols] range, that meet the available + // shared memory constraints. + auto smemPerBlk = raft::getSharedMemPerBlock(); + int batch_ncols = min(ncols, (int)(smemPerBlk / (sizeof(T) * 2))); + int num_batches = raft::ceildiv(ncols, batch_ncols); + smemSize = sizeof(T) * 2 * batch_ncols; + + minmaxKernel<<>>(data, + rowids, + colids, + nrows, + ncols, + row_stride, + globalmin, + globalmax, + sampledcols, + init_val, + batch_ncols, + num_batches); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + decodeKernel<<>>(globalmin, globalmax, ncols); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +}; // end namespace detail +}; // end namespace stats +} +; // end namespace raft diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh new file mode 100644 index 0000000000..ca7fc136d3 --- /dev/null +++ b/cpp/include/raft/stats/detail/weighted_mean.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace stats { +namespace detail { + +/** + * @brief Compute the row-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be row-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void rowWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + // sum the weights & copy back to CPU + Type WS = 0; + raft::linalg::coalescedReduction(mu, weights, D, 1, (Type)0, stream, false); + raft::update_host(&WS, mu, 1, stream); + + raft::linalg::coalescedReduction( + mu, + data, + D, + N, + (Type)0, + stream, + false, + [weights] __device__(Type v, int i) { return v * weights[i]; }, + [] __device__(Type a, Type b) { return a + b; }, + [WS] __device__(Type v) { return v / WS; }); +} + +/** + * @brief Compute the column-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be column-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void colWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + // sum the weights & copy back to CPU + 
Type WS = 0; + raft::linalg::stridedReduction(mu, weights, 1, N, (Type)0, stream, false); + raft::update_host(&WS, mu, 1, stream); + + raft::linalg::stridedReduction( + mu, + data, + D, + N, + (Type)0, + stream, + false, + [weights] __device__(Type v, int i) { return v * weights[i]; }, + [] __device__(Type a, Type b) { return a + b; }, + [WS] __device__(Type v) { return v / WS; }); +} +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp new file mode 100644 index 0000000000..30e982115a --- /dev/null +++ b/cpp/include/raft/stats/histogram.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { +/** + * @brief Perform histogram on the input data. It chooses the right load size + * based on the input data vector length. It also supports large-bin cases + * using a specialized smem-based hashing technique. 
+ * @tparam DataT input data type + * @tparam IdxT data type used to compute indices + * @tparam BinnerOp takes the input data and computes its bin index + * @param type histogram implementation type to choose + * @param bins the output bins (length = ncols * nbins) + * @param nbins number of bins + * @param data input data (length = ncols * nrows) + * @param nrows data array length in each column (or batch) + * @param ncols number of columsn (or batch size) + * @param stream cuda stream + * @param binner the operation that computes the bin index of the input data + * + * @note signature of BinnerOp is `int func(DataT, IdxT);` + */ +template > +void histogram(HistType type, + int* bins, + IdxT nbins, + const DataT* data, + IdxT nrows, + IdxT ncols, + cudaStream_t stream, + BinnerOp binner = IdentityBinner()) +{ + detail::histogram(type, bins, nbins, data, nrows, ncols, stream, binner); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp new file mode 100644 index 0000000000..966287bb41 --- /dev/null +++ b/cpp/include/raft/stats/minmax.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft { +namespace stats { + +/** + * @brief Computes min/max across every column of the input matrix, as well as + * optionally allow to subsample based on the given row/col ID mapping vectors + * + * @tparam T the data type + * @tparam TPB number of threads per block + * @param data input data + * @param rowids actual row ID mappings. It is of length nrows. If you want to + * skip this index lookup entirely, pass nullptr + * @param colids actual col ID mappings. It is of length ncols. If you want to + * skip this index lookup entirely, pass nullptr + * @param nrows number of rows of data to be worked upon. The actual rows of the + * input "data" can be bigger than this! + * @param ncols number of cols of data to be worked upon. The actual cols of the + * input "data" can be bigger than this! + * @param row_stride stride (in number of elements) between 2 adjacent columns + * @param globalmin final col-wise global minimum (size = ncols) + * @param globalmax final col-wise global maximum (size = ncols) + * @param sampledcols output sampled data. Pass nullptr if you don't need this + * @param stream cuda stream + * @note This method makes the following assumptions: + * 1. input and output matrices are assumed to be col-major + * 2. 
ncols is small enough to fit the whole of min/max values across all cols + * in shared memory + */ +template +void minmax(const T* data, + const unsigned* rowids, + const unsigned* colids, + int nrows, + int ncols, + int row_stride, + T* globalmin, + T* globalmax, + T* sampledcols, + cudaStream_t stream) +{ + detail::minmax( + data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp new file mode 100644 index 0000000000..ad90142a08 --- /dev/null +++ b/cpp/include/raft/stats/weighted_mean.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute the row-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be row-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void rowWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + detail::rowWeightedMean(mu, data, weights, D, N, stream); +} + +/** + * @brief Compute the column-wise weighted mean of the input matrix + * + * @tparam Type the data type + * @param mu the output mean vector + * @param data the input matrix (assumed to be column-major) + * @param weights per-column means + * @param D number of columns of data + * @param N number of rows of data + * @param stream cuda stream to launch work on + */ +template +void colWeightedMean( + Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream) +{ + detail::colWeightedMean(mu, data, weights, D, N, stream); +} +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 2ace88b498..cd08de629c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -16,6 +16,7 @@ # keep the files in alphabetical order! 
add_executable(test_raft + test/common/seive.cu test/cudart_utils.cpp test/cluster_solvers.cu test/distance/dist_adj.cu @@ -103,11 +104,15 @@ add_executable(test_raft test/spatial/faiss_mr.cu test/spatial/selection.cu test/spectral_matrix.cu + test/stats/cov.cu + test/stats/histogram.cu test/stats/mean.cu test/stats/meanvar.cu test/stats/mean_center.cu + test/stats/minmax.cu test/stats/stddev.cu test/stats/sum.cu + test/stats/weighted_mean.cu test/test.cpp ) diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu new file mode 100644 index 0000000000..ca46397b19 --- /dev/null +++ b/cpp/test/common/seive.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft { +namespace common { +TEST(Seive, Test) +{ + Seive s1(32); + ASSERT_TRUE(s1.isPrime(17)); + ASSERT_FALSE(s1.isPrime(28)); + + Seive s2(1024 * 1024); + ASSERT_TRUE(s2.isPrime(107)); + ASSERT_FALSE(s2.isPrime(111)); + ASSERT_TRUE(s2.isPrime(6047)); +} + +} // end namespace common +} // end namespace raft diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu new file mode 100644 index 0000000000..92f3101d75 --- /dev/null +++ b/cpp/test/stats/cov.cu @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +template +struct CovInputs { + T tolerance, mean, var; + int rows, cols; + bool sample, rowMajor, stable; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) +{ + return os; +} + +template +class CovTest : public ::testing::TestWithParam> { + protected: + CovTest() + : data(0, stream), + mean_act(0, stream), + cov_act(0, stream), + cov_cm(0, stream), + cov_cm_ref(0, stream) + { + } + + void SetUp() override + { + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + params.tolerance *= 2; + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + auto len = rows * cols; + T var = params.var; + data.resize(len, stream); + mean_act.resize(cols, stream); + cov_act.resize(cols * cols, stream); + + r.normal(data.data(), len, params.mean, var, stream); + raft::stats::mean( + mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + cov(handle, + cov_act.data(), + data.data(), + mean_act.data(), + cols, + rows, + params.sample, + params.rowMajor, + params.stable, + stream); + + T data_h[6] = {1.0, 2.0, 5.0, 4.0, 2.0, 1.0}; + T cov_cm_ref_h[4] = {4.3333, -2.8333, -2.8333, 2.333}; + + cov_cm.resize(4, stream); + cov_cm_ref.resize(4, stream); + rmm::device_uvector data_cm(6, stream); + rmm::device_uvector mean_cm(2, stream); + + 
raft::update_device(data_cm.data(), data_h, 6, stream); + raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); + + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); + } + + protected: + CovInputs params; + rmm::device_uvector data, mean_act, cov_act, cov_cm, cov_cm_ref; + cublasHandle_t handle; + cudaStream_t stream = 0; +}; + +///@todo: add stable=false after it has been implemented +const std::vector> inputsf = { + {0.03f, 1.f, 2.f, 32 * 1024, 32, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 64, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 128, true, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 256, true, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 32, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 64, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 128, false, false, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 256, false, false, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 32, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 64, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 128, true, true, true, 1234ULL}, + {0.03f, 1.f, 2.f, 32 * 1024, 256, true, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 32, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 64, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 128, false, true, true, 1234ULL}, + {0.03f, -1.f, 2.f, 32 * 1024, 256, false, true, true, 1234ULL}}; + +const std::vector> inputsd = { + {0.03, 1.0, 2.0, 32 * 1024, 32, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 64, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 128, true, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 256, true, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 32, false, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 64, false, false, 
true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 128, false, false, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 256, false, false, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 32, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 64, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 128, true, true, true, 1234ULL}, + {0.03, 1.0, 2.0, 32 * 1024, 256, true, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 32, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 64, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 128, false, true, true, 1234ULL}, + {0.03, -1.0, 2.0, 32 * 1024, 256, false, true, true, 1234ULL}}; + +typedef CovTest CovTestF; +TEST_P(CovTestF, Result) +{ + ASSERT_TRUE(raft::diagonalMatch(params.var * params.var, + cov_act.data(), + params.cols, + params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestD; +TEST_P(CovTestD, Result) +{ + ASSERT_TRUE(raft::diagonalMatch(params.var * params.var, + cov_act.data(), + params.cols, + params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestSmallF; +TEST_P(CovTestSmallF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox(params.tolerance))); +} + +typedef CovTest CovTestSmallD; +TEST_P(CovTestSmallD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestD, ::testing::ValuesIn(inputsd)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallD, ::testing::ValuesIn(inputsd)); + +} // namespace stats +} // namespace raft diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu new file mode 100644 index 0000000000..60dc5fb909 --- /dev/null +++ b/cpp/test/stats/histogram.cu @@ -0,0 
+1,262 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +// Note: this kernel also updates the input vector to take care of OOB bins! +__global__ void naiveHistKernel(int* bins, int nbins, int* in, int nrows) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + auto offset = blockIdx.y * nrows; + auto binOffset = blockIdx.y * nbins; + for (; tid < nrows; tid += stride) { + int id = in[offset + tid]; + if (id < 0) + id = 0; + else if (id >= nbins) + id = nbins - 1; + in[offset + tid] = id; + raft::myAtomicAdd(bins + binOffset + id, 1); + } +} + +void naiveHist(int* bins, int nbins, int* in, int nrows, int ncols, cudaStream_t stream) +{ + const int TPB = 128; + int nblksx = raft::ceildiv(nrows, TPB); + dim3 blks(nblksx, ncols); + naiveHistKernel<<>>(bins, nbins, in, nrows); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +struct HistInputs { + int nrows, ncols, nbins; + bool isNormal; + HistType type; + int start, end; + unsigned long long int seed; +}; + +class HistTest : public ::testing::TestWithParam { + protected: + HistTest() : in(0, stream), bins(0, stream), ref_bins(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam::GetParam(); + raft::random::Rng r(params.seed); + 
RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + int len = params.nrows * params.ncols; + in.resize(len, stream); + if (params.isNormal) { + r.normalInt(in.data(), len, params.start, params.end, stream); + } else { + r.uniformInt(in.data(), len, params.start, params.end, stream); + } + bins.resize(params.nbins * params.ncols, stream); + ref_bins.resize(params.nbins * params.ncols, stream); + RAFT_CUDA_TRY( + cudaMemsetAsync(ref_bins.data(), 0, sizeof(int) * params.nbins * params.ncols, stream)); + naiveHist(ref_bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream); + histogram( + params.type, bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream); + raft::interruptible::synchronize(stream); + } + + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + protected: + cudaStream_t stream = 0; + HistInputs params; + rmm::device_uvector in, bins, ref_bins; +}; + +static const int oneK = 1024; +static const int oneM = oneK * oneK; +const std::vector inputs = { + {oneM, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, 
HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM, 21, 
2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits4, 
1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, 
true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL}, + + {oneM, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 1, 2 * oneM, true, HistTypeAuto, 
1000, 50, 1234ULL}, + {oneM, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 1, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL}, + {oneM + 2, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 1, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 1, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, + {oneM + 2, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL}, + {oneM + 2, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL}, +}; +TEST_P(HistTest, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + ref_bins.data(), bins.data(), params.nbins * params.ncols, raft::Compare())); +} +INSTANTIATE_TEST_CASE_P(HistTests, HistTest, ::testing::ValuesIn(inputs)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu new file mode 100644 index 0000000000..e505f3ed00 --- /dev/null +++ b/cpp/test/stats/minmax.cu @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: need to add tests for verifying the column subsampling feature + +template +struct MinMaxInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MinMaxInputs& dims) +{ + return os; +} + +template +__global__ void naiveMinMaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= ncols) return; + globalmin[tid] = init_val; + globalmax[tid] = -init_val; +} + +template +__global__ void naiveMinMaxKernel(const T* data, int nrows, int ncols, T* globalmin, T* globalmax) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int col = tid / nrows; + if (col < ncols) { + T val = data[tid]; + if (!isnan(val)) { + raft::myAtomicMin(&globalmin[col], val); + raft::myAtomicMax(&globalmax[col], val); + } + } +} + +template +void naiveMinMax( + const T* data, int nrows, int ncols, T* globalmin, T* globalmax, cudaStream_t stream) +{ + const int TPB = 128; + int nblks = raft::ceildiv(ncols, TPB); + T init_val = std::numeric_limits::max(); + naiveMinMaxInitKernel<<>>(ncols, globalmin, globalmax, init_val); + RAFT_CUDA_TRY(cudaGetLastError()); + nblks = raft::ceildiv(nrows * ncols, TPB); + naiveMinMaxKernel<<>>(data, nrows, ncols, globalmin, globalmax); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +__global__ void nanKernel(T* data, const 
bool* mask, int len, T nan) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= len) return; + if (!mask[tid]) data[tid] = nan; +} + +template +class MinMaxTest : public ::testing::TestWithParam> { + protected: + MinMaxTest() : minmax_act(0, stream), minmax_ref(0, stream) {} + + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.rows * params.cols; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + rmm::device_uvector data(len, stream); + rmm::device_uvector mask(len, stream); + minmax_act.resize(2 * params.cols, stream); + minmax_ref.resize(2 * params.cols, stream); + + r.normal(data.data(), len, (T)0.0, (T)1.0, stream); + T nan_prob = 0.01; + r.bernoulli(mask.data(), len, nan_prob, stream); + const int TPB = 256; + nanKernel<<>>( + data.data(), mask.data(), len, std::numeric_limits::quiet_NaN()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + naiveMinMax(data.data(), + params.rows, + params.cols, + minmax_ref.data(), + minmax_ref.data() + params.cols, + stream); + minmax(data.data(), + nullptr, + nullptr, + params.rows, + params.cols, + params.rows, + minmax_act.data(), + minmax_act.data() + params.cols, + nullptr, + stream); + } + + protected: + MinMaxInputs params; + rmm::device_uvector minmax_act; + rmm::device_uvector minmax_ref; + cudaStream_t stream = 0; +}; + +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}, + {0.00001f, 1024, 512, 1234ULL}, + {0.00001f, 1024, 1024, 1234ULL}, + {0.00001f, 4096, 32, 1234ULL}, + {0.00001f, 4096, 64, 1234ULL}, + {0.00001f, 4096, 128, 1234ULL}, + {0.00001f, 4096, 256, 1234ULL}, + {0.00001f, 4096, 512, 1234ULL}, + {0.00001f, 4096, 1024, 1234ULL}, + {0.00001f, 8192, 32, 1234ULL}, + {0.00001f, 8192, 64, 1234ULL}, + {0.00001f, 8192, 128, 1234ULL}, + {0.00001f, 8192, 256, 1234ULL}, + {0.00001f, 8192, 512, 1234ULL}, + {0.00001f, 8192, 
1024, 1234ULL}, + {0.00001f, 1024, 8192, 1234ULL}}; + +const std::vector> inputsd = {{0.0000001, 1024, 32, 1234ULL}, + {0.0000001, 1024, 64, 1234ULL}, + {0.0000001, 1024, 128, 1234ULL}, + {0.0000001, 1024, 256, 1234ULL}, + {0.0000001, 1024, 512, 1234ULL}, + {0.0000001, 1024, 1024, 1234ULL}, + {0.0000001, 4096, 32, 1234ULL}, + {0.0000001, 4096, 64, 1234ULL}, + {0.0000001, 4096, 128, 1234ULL}, + {0.0000001, 4096, 256, 1234ULL}, + {0.0000001, 4096, 512, 1234ULL}, + {0.0000001, 4096, 1024, 1234ULL}, + {0.0000001, 8192, 32, 1234ULL}, + {0.0000001, 8192, 64, 1234ULL}, + {0.0000001, 8192, 128, 1234ULL}, + {0.0000001, 8192, 256, 1234ULL}, + {0.0000001, 8192, 512, 1234ULL}, + {0.0000001, 8192, 1024, 1234ULL}, + {0.0000001, 1024, 8192, 1234ULL}}; + +typedef MinMaxTest MinMaxTestF; +TEST_P(MinMaxTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(), + minmax_act.data(), + 2 * params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef MinMaxTest MinMaxTestD; +TEST_P(MinMaxTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(), + minmax_act.data(), + 2 * params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu new file mode 100644 index 0000000000..b3502bc5bc --- /dev/null +++ b/cpp/test/stats/weighted_mean.cu @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace stats { + +template +struct WeightedMeanInputs { + T tolerance; + int M, N; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const WeightedMeanInputs& I) +{ + return os << "{ " << I.tolerance << ", " << I.M << ", " << I.N << ", " << I.seed << "}" + << std::endl; +} + +///// weighted row-wise mean test and support functions +template +void naiveRowWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor) +{ + int istr = rowMajor ? 1 : M; + int jstr = rowMajor ? 
N : 1; + + // sum the weights + T WS = 0; + for (int i = 0; i < N; i++) + WS += W[i]; + + for (int j = 0; j < M; j++) { + R[j] = (T)0; + for (int i = 0; i < N; i++) { + // R[j] += (W[i]*D[i*istr + j*jstr] - R[j])/(T)(i+1); + R[j] += (W[i] * D[i * istr + j * jstr]) / WS; + } + } +} + +template +class RowWeightedMeanTest : public ::testing::TestWithParam> { + protected: + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.M, cols = params.N, len = rows * cols; + cudaStream_t stream = 0; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + // device-side data + din.resize(len); + dweights.resize(cols); + dexp.resize(rows); + dact.resize(rows); + + // create random matrix and weights + r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream); + r.uniform(dweights.data().get(), cols, T(-1.0), T(1.0), stream); + + // host-side data + thrust::host_vector hin = din; + thrust::host_vector hweights = dweights; + thrust::host_vector hexp(rows); + + // compute naive result & copy to GPU + naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true); + dexp = hexp; + + // compute ml-prims result + rowWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream); + + // adjust tolerance to account for round-off accumulation + params.tolerance *= params.N; + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + void TearDown() override {} + + protected: + WeightedMeanInputs params; + thrust::host_vector hin, hweights; + thrust::device_vector din, dweights, dexp, dact; +}; + +///// weighted column-wise mean test and support functions +template +void naiveColWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor) +{ + int istr = rowMajor ? 1 : M; + int jstr = rowMajor ? 
N : 1; + + // sum the weights + T WS = 0; + for (int j = 0; j < M; j++) + WS += W[j]; + + for (int i = 0; i < N; i++) { + R[i] = (T)0; + for (int j = 0; j < M; j++) { + // R[i] += (W[j]*D[i*istr + j*jstr] - R[i])/(T)(j+1); + R[i] += (W[j] * D[i * istr + j * jstr]) / WS; + } + } +} + +template +class ColWeightedMeanTest : public ::testing::TestWithParam> { + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.M, cols = params.N, len = rows * cols; + + cudaStream_t stream = 0; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + // device-side data + din.resize(len); + dweights.resize(rows); + dexp.resize(cols); + dact.resize(cols); + + // create random matrix and weights + r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream); + r.uniform(dweights.data().get(), rows, T(-1.0), T(1.0), stream); + + // host-side data + thrust::host_vector hin = din; + thrust::host_vector hweights = dweights; + thrust::host_vector hexp(cols); + + // compute naive result & copy to GPU + naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true); + dexp = hexp; + + // compute ml-prims result + colWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream); + + // adjust tolerance to account for round-off accumulation + params.tolerance *= params.M; + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); + } + + void TearDown() override {} + + protected: + WeightedMeanInputs params; + thrust::host_vector hin, hweights; + thrust::device_vector din, dweights, dexp, dact; +}; + +////// Parameter sets and test instantiation +static const float tolF = 128 * std::numeric_limits::epsilon(); +static const double tolD = 256 * std::numeric_limits::epsilon(); + +const std::vector> inputsf = {{tolF, 4, 4, 1234}, + {tolF, 1024, 32, 1234}, + {tolF, 1024, 64, 1234}, + {tolF, 1024, 128, 1234}, + {tolF, 1024, 256, 1234}, + {tolF, 1024, 32, 1234}, + {tolF, 1024, 64, 1234}, + {tolF, 1024, 
128, 1234}, + {tolF, 1024, 256, 1234}}; + +const std::vector> inputsd = {{tolD, 4, 4, 1234}, + {tolD, 1024, 32, 1234}, + {tolD, 1024, 64, 1234}, + {tolD, 1024, 128, 1234}, + {tolD, 1024, 256, 1234}, + {tolD, 1024, 32, 1234}, + {tolD, 1024, 64, 1234}, + {tolD, 1024, 128, 1234}, + {tolD, 1024, 256, 1234}}; + +using RowWeightedMeanTestF = RowWeightedMeanTest; +TEST_P(RowWeightedMeanTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestF, ::testing::ValuesIn(inputsf)); + +using RowWeightedMeanTestD = RowWeightedMeanTest; +TEST_P(RowWeightedMeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestD, ::testing::ValuesIn(inputsd)); + +using ColWeightedMeanTestF = ColWeightedMeanTest; +TEST_P(ColWeightedMeanTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestF, ::testing::ValuesIn(inputsf)); + +using ColWeightedMeanTestD = ColWeightedMeanTest; +TEST_P(ColWeightedMeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestD, ::testing::ValuesIn(inputsd)); + +}; // end namespace stats +}; // end namespace raft From 3297a3dc5c6958e9f1141b1adc03ce31e9b982a7 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Feb 2022 17:07:34 -0500 Subject: [PATCH 14/24] Re-routing includes --- cpp/test/stats/cov.cu | 2 +- cpp/test/stats/histogram.cu | 2 +- cpp/test/stats/minmax.cu | 2 +- cpp/test/stats/weighted_mean.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 92f3101d75..02e01deec3 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu index 60dc5fb909..b89e2ab208 100644 --- a/cpp/test/stats/histogram.cu +++ b/cpp/test/stats/histogram.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu index e505f3ed00..777ac800e9 100644 --- a/cpp/test/stats/minmax.cu +++ b/cpp/test/stats/minmax.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu index b3502bc5bc..df77a19d73 100644 --- a/cpp/test/stats/weighted_mean.cu +++ b/cpp/test/stats/weighted_mean.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include From 510188d56531dc753def075c10cde9fc9565002c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 17:28:47 -0500 Subject: [PATCH 15/24] Moving epsilon neighborhood. 
Need data generators for tests --- cpp/include/raft/device_utils.cuh | 108 ++++++++ .../knn/detail/epsilon_neighborhood.cuh | 233 ++++++++++++++++++ .../raft/spatial/knn/epsilon_neighborhood.hpp | 56 +++++ cpp/test/spatial/epsilon_neighborhood.cu | 140 +++++++++++ 4 files changed, 537 insertions(+) create mode 100644 cpp/include/raft/device_utils.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh create mode 100644 cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp create mode 100644 cpp/test/spatial/epsilon_neighborhood.cu diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh new file mode 100644 index 0000000000..5674e2f1c2 --- /dev/null +++ b/cpp/include/raft/device_utils.cuh @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include // pair + +namespace raft { + +// TODO move to raft https://github.com/rapidsai/raft/issues/90 +/** helper method to get the compute capability version numbers */ + inline std::pair getDeviceCapability() + { + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int major, minor; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId)); + return std::make_pair(major, minor); + } + +/** + * @brief Batched warp-level sum reduction + * + * @tparam T data type + * @tparam NThreads Number of threads in the warp doing independent reductions + * + * @param[in] val input value + * @return for the first "group" of threads, the reduced value. All + * others will contain unusable values! + * + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block and also doesn't support this kind of + * batched reduction operation + * @note All threads in the warp must enter this function together + * + * @todo Expand this to support arbitrary reduction ops + */ + template + DI T batchedWarpReduce(T val) +{ +#pragma unroll + for (int i = NThreads; i < raft::WarpSize; i <<= 1) { + val += raft::shfl(val, raft::laneId() + i); +} +return val; +} + +/** + * @brief 1-D block-level batched sum reduction + * + * @tparam T data type + * @tparam NThreads Number of threads in the warp doing independent reductions + * + * @param val input value + * @param smem shared memory region needed for storing intermediate results. It + * must alteast be of size: `sizeof(T) * nWarps * NThreads` + * @return for the first "group" of threads in the block, the reduced value. + * All others will contain unusable values! + * + * @note Why not cub? 
Because cub doesn't seem to allow working with arbitrary + * number of warps in a block and also doesn't support this kind of + * batched reduction operation + * @note All threads in the block must enter this function together + * + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T batchedBlockReduce(T val, char* smem) +{ +auto* sTemp = reinterpret_cast(smem); +constexpr int nGroupsPerWarp = raft::WarpSize / NThreads; +static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!"); +const int nGroups = (blockDim.x + NThreads - 1) / NThreads; +const int lid = raft::laneId(); +const int lgid = lid % NThreads; +const int gid = threadIdx.x / NThreads; +const auto wrIdx = (gid / nGroupsPerWarp) * NThreads + lgid; +const auto rdIdx = gid * NThreads + lgid; +for (int i = nGroups; i > 0;) { +auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp; +if (gid < iAligned) { +val = batchedWarpReduce(val); +if (lid < NThreads) sTemp[wrIdx] = val; +} +__syncthreads(); +i /= nGroupsPerWarp; +if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); } +__syncthreads(); +} +return val; +} + +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh new file mode 100644 index 0000000000..4158ac3179 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { + namespace spatial { + namespace knn { + namespace detail { + + template> + struct EpsUnexpL2SqNeighborhood : public BaseClass { + private: + typedef Policy P; + + bool *adj; + DataT eps; + IdxT *vd; + + char *smem; // for final reductions + + DataT acc[P::AccRowsPerTh][P::AccColsPerTh]; + + public: + DI EpsUnexpL2SqNeighborhood(bool *_adj, + IdxT *_vd, + const DataT *_x, + const DataT *_y, + IdxT _m, + IdxT _n, + IdxT _k, + DataT _eps, + char *_smem) + : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) { + } + + DI void run() { + prolog(); + loop(); + epilog(); + } + + private: + DI void prolog() { + this->ldgXY(0); +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = BaseClass::Zero; + } + } + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + } + + DI void loop() { + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(kidx); + accumulate(); // on the previous k-block + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + this->pageRd ^= 1; + } + accumulate(); // last iteration + } + + DI void epilog() { + IdxT startx = blockIdx.x * P::Mblk + this->accrowid; + IdxT starty = blockIdx.y * P::Nblk + this->acccolid; + auto lid = raft::laneId(); + IdxT sums[P::AccColsPerTh]; +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + sums[j] = 0; + } +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto xid = startx + i * P::AccThRows; +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto yid = starty + j * P::AccThCols; + auto is_neigh = acc[i][j] <= eps; + ///@todo: fix uncoalesced writes using shared mem + if (xid < this->m && yid < this->n) { + adj[xid * this->n + yid] = is_neigh; + sums[j] += is_neigh; + } + } + } + 
// perform reduction of adjacency values to compute vertex degrees + if (vd != nullptr) { updateVertexDegree(sums); } + } + + DI void accumulate() { +#pragma unroll + for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { + this->ldsXY(ki); +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { +#pragma unroll + for (int v = 0; v < P::Veclen; ++v) { + auto diff = this->regx[i][v] - this->regy[j][v]; + acc[i][j] += diff * diff; + } + } + } + } + } + + DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) { + __syncthreads(); // so that we can safely reuse smem + int gid = threadIdx.x / P::AccThCols; + int lid = threadIdx.x % P::AccThCols; + auto cidx = IdxT(blockIdx.y) * P::Nblk + lid; + IdxT totalSum = 0; + // update the individual vertex degrees +#pragma unroll + for (int i = 0; i < P::AccColsPerTh; ++i) { + sums[i] = batchedBlockReduce(sums[i], smem); + auto cid = cidx + i * P::AccThCols; + if (gid == 0 && cid < this->n) { + atomicUpdate(cid, sums[i]); + totalSum += sums[i]; + } + __syncthreads(); // for safe smem reuse + } + // update the total edge count + totalSum = raft::blockReduce(totalSum, smem); + if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); } + } + + DI void atomicUpdate(IdxT addrId, IdxT val) { + if (sizeof(IdxT) == 4) { + raft::myAtomicAdd((unsigned *) (vd + addrId), val); + } else if (sizeof(IdxT) == 8) { + raft::myAtomicAdd((unsigned long long *) (vd + addrId), val); + } + } + }; // struct EpsUnexpL2SqNeighborhood + + template + __global__ __launch_bounds__(Policy::Nthreads, + 2) + + void epsUnexpL2SqNeighKernel( + bool *adj, IdxT *vd, const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, DataT eps) { + extern __shared__ char smem[]; + EpsUnexpL2SqNeighborhood obj(adj, vd, x, y, m, n, k, eps, smem); + obj.run(); + } + + template + void epsUnexpL2SqNeighImpl(bool *adj, + IdxT *vd, + const DataT *x, + const DataT *y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + 
cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy Policy; + dim3 grid(raft::ceildiv(m, Policy::Mblk), raft::ceildiv(n, Policy::Nblk)); + dim3 blk(Policy::Nthreads); + epsUnexpL2SqNeighKernel < DataT, IdxT, Policy > + <<>>(adj, vd, x, y, m, n, k, eps); + RAFT_CUDA_TRY(cudaGetLastError()); + } + +/** + * @brief Computes epsilon neighborhood for the L2-Squared distance metric + * + * @tparam DataT IO and math type + * @tparam IdxT Index type + * + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. + * @param[in] x first matrix [row-major] [on device] [dim = m x k] + * @param[in] y second matrix [row-major] [on device] [dim = n x k] + * @param[in] eps defines epsilon neighborhood radius (should be passed as + * squared as we compute L2-squared distance in this method) + * @param[in] fop device lambda to do any other custom functions + * @param[in] stream cuda stream + */ + template + void epsUnexpL2SqNeighborhood(bool *adj, + IdxT *vd, + const DataT *x, + const DataT *y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) { + size_t bytes = sizeof(DataT) * k; + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } + } + } // namespace detail + } // namespace knn + } // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp new file mode 100644 index 0000000000..6ef95dc010 --- /dev/null +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -0,0 +1,56 @@ +/* + * 
Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace spatial { +namespace knn { + +/** + * @brief Computes epsilon neighborhood for the L2-Squared distance metric + * + * @tparam DataT IO and math type + * @tparam IdxT Index type + * + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. 
+ * @param[in] x first matrix [row-major] [on device] [dim = m x k] + * @param[in] y second matrix [row-major] [on device] [dim = n x k] + * @param[in] eps defines epsilon neighborhood radius (should be passed as + * squared as we compute L2-squared distance in this method) + * @param[in] fop device lambda to do any other custom functions + * @param[in] stream cuda stream + */ +template +void epsUnexpL2SqNeighborhood(bool *adj, + IdxT *vd, + const DataT *x, + const DataT *y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) { + detail::epsUnexpL2SqNeighborhood(adj, vd, x, y, m, n, k, eps, stream); +} +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu new file mode 100644 index 0000000000..be62cf0208 --- /dev/null +++ b/cpp/test/spatial/epsilon_neighborhood.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { + template + struct EpsInputs { + IdxT n_row, n_col, n_centers, n_batches; + T eps; + }; + + template + ::std::ostream &operator<<(::std::ostream &os, const EpsInputs &p) { + return os; + } + + template + class EpsNeighTest : public ::testing::TestWithParam> { + protected: + EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {} + + void SetUp() override { + param = ::testing::TestWithParam>::GetParam(); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + data.resize(param.n_row * param.n_col, stream); + labels.resize(param.n_row, stream); + batchSize = param.n_row / param.n_batches; + adj.resize(param.n_row * batchSize, stream); + vd.resize(batchSize + 1, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream)); + Random::make_blobs(data.data(), + labels.data(), + param.n_row, + param.n_col, + param.n_centers, + stream, + true, + nullptr, + nullptr, + T(0.01), + false); + } + + void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + EpsInputs param; + cudaStream_t stream = 0; + rmm::device_uvector data; + rmm::device_uvector adj; + rmm::device_uvector labels, vd; + IdxT batchSize; + }; // class EpsNeighTest + + const std::vector > inputsfi = { + {15000, 16, 5, 1, 2.f}, + {14000, 16, 5, 1, 2.f}, + {15000, 17, 5, 1, 2.f}, + {14000, 17, 5, 1, 2.f}, + {15000, 18, 5, 1, 2.f}, + {14000, 18, 5, 1, 2.f}, + {15000, 32, 5, 1, 2.f}, + {14000, 32, 5, 1, 2.f}, + {20000, 10000, 10, 1, 2.f}, + {20000, 10000, 10, 2, 2.f}, + }; + typedef EpsNeighTest EpsNeighTestFI; + TEST_P(EpsNeighTestFI, Result + ) { + for ( + int i = 0; + i(adj + . + + data(), + vd + + . + + data(), + data + + . + + data(), + data + + . + + data() + + + ( + i *batchSize + * param.n_col), + param.n_row, + batchSize, + param.n_col, + param. 
+ eps *param + .eps, + stream); + ASSERT_TRUE(raft::devArrMatch( + param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare(), stream) + ); + } +} +INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi) +); + +}; // namespace knn +}; // namespace spatial +}; // namespace raft From 5a770e9a56b4094c6275ec445723124454574d99 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 19:37:25 -0500 Subject: [PATCH 16/24] Finishing out the move --- cpp/include/raft/device_utils.cuh | 70 ++-- .../knn/detail/epsilon_neighborhood.cuh | 374 +++++++++--------- .../raft/spatial/knn/epsilon_neighborhood.hpp | 17 +- cpp/include/raft/stats/common.hpp | 67 ++++ cpp/include/raft/stats/detail/histogram.cuh | 39 +- cpp/include/raft/stats/detail/minmax.cuh | 12 +- cpp/include/raft/stats/histogram.hpp | 2 + cpp/test/spatial/epsilon_neighborhood.cu | 2 +- cpp/test/stats/cov.cu | 2 +- cpp/test/stats/histogram.cu | 2 +- cpp/test/stats/minmax.cu | 2 +- cpp/test/stats/weighted_mean.cu | 2 +- 12 files changed, 313 insertions(+), 278 deletions(-) create mode 100644 cpp/include/raft/stats/common.hpp diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh index 5674e2f1c2..d89a484109 100644 --- a/cpp/include/raft/device_utils.cuh +++ b/cpp/include/raft/device_utils.cuh @@ -23,15 +23,15 @@ namespace raft { // TODO move to raft https://github.com/rapidsai/raft/issues/90 /** helper method to get the compute capability version numbers */ - inline std::pair getDeviceCapability() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int major, minor; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId)); - return std::make_pair(major, minor); - } +inline std::pair getDeviceCapability() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int major, minor; + 
RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId)); + return std::make_pair(major, minor); +} /** * @brief Batched warp-level sum reduction @@ -50,14 +50,14 @@ namespace raft { * * @todo Expand this to support arbitrary reduction ops */ - template - DI T batchedWarpReduce(T val) +template +DI T batchedWarpReduce(T val) { #pragma unroll - for (int i = NThreads; i < raft::WarpSize; i <<= 1) { + for (int i = NThreads; i < raft::WarpSize; i <<= 1) { val += raft::shfl(val, raft::laneId() + i); -} -return val; + } + return val; } /** @@ -82,27 +82,27 @@ return val; template DI T batchedBlockReduce(T val, char* smem) { -auto* sTemp = reinterpret_cast(smem); -constexpr int nGroupsPerWarp = raft::WarpSize / NThreads; -static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!"); -const int nGroups = (blockDim.x + NThreads - 1) / NThreads; -const int lid = raft::laneId(); -const int lgid = lid % NThreads; -const int gid = threadIdx.x / NThreads; -const auto wrIdx = (gid / nGroupsPerWarp) * NThreads + lgid; -const auto rdIdx = gid * NThreads + lgid; -for (int i = nGroups; i > 0;) { -auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp; -if (gid < iAligned) { -val = batchedWarpReduce(val); -if (lid < NThreads) sTemp[wrIdx] = val; -} -__syncthreads(); -i /= nGroupsPerWarp; -if (i > 0) { val = gid < i ? 
sTemp[rdIdx] : T(0); } -__syncthreads(); -} -return val; + auto* sTemp = reinterpret_cast(smem); + constexpr int nGroupsPerWarp = raft::WarpSize / NThreads; + static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!"); + const int nGroups = (blockDim.x + NThreads - 1) / NThreads; + const int lid = raft::laneId(); + const int lgid = lid % NThreads; + const int gid = threadIdx.x / NThreads; + const auto wrIdx = (gid / nGroupsPerWarp) * NThreads + lgid; + const auto rdIdx = gid * NThreads + lgid; + for (int i = nGroups; i > 0;) { + auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp; + if (gid < iAligned) { + val = batchedWarpReduce(val); + if (lid < NThreads) sTemp[wrIdx] = val; + } + __syncthreads(); + i /= nGroupsPerWarp; + if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); } + __syncthreads(); + } + return val; } } // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh index 4158ac3179..3b4a8d4174 100644 --- a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -20,176 +20,185 @@ #include namespace raft { - namespace spatial { - namespace knn { - namespace detail { - - template> - struct EpsUnexpL2SqNeighborhood : public BaseClass { - private: - typedef Policy P; - - bool *adj; - DataT eps; - IdxT *vd; - - char *smem; // for final reductions - - DataT acc[P::AccRowsPerTh][P::AccColsPerTh]; - - public: - DI EpsUnexpL2SqNeighborhood(bool *_adj, - IdxT *_vd, - const DataT *_x, - const DataT *_y, - IdxT _m, - IdxT _n, - IdxT _k, - DataT _eps, - char *_smem) - : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) { - } - - DI void run() { - prolog(); - loop(); - epilog(); - } - - private: - DI void prolog() { - this->ldgXY(0); +namespace spatial { +namespace knn { +namespace detail { + +template > +struct 
EpsUnexpL2SqNeighborhood : public BaseClass { + private: + typedef Policy P; + + bool* adj; + DataT eps; + IdxT* vd; + + char* smem; // for final reductions + + DataT acc[P::AccRowsPerTh][P::AccColsPerTh]; + + public: + DI EpsUnexpL2SqNeighborhood(bool* _adj, + IdxT* _vd, + const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + DataT _eps, + char* _smem) + : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) + { + } + + DI void run() + { + prolog(); + loop(); + epilog(); + } + + private: + DI void prolog() + { + this->ldgXY(0); #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; - } - } - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; - } - - DI void loop() { - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(kidx); - accumulate(); // on the previous k-block - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; - this->pageRd ^= 1; - } - accumulate(); // last iteration - } - - DI void epilog() { - IdxT startx = blockIdx.x * P::Mblk + this->accrowid; - IdxT starty = blockIdx.y * P::Nblk + this->acccolid; - auto lid = raft::laneId(); - IdxT sums[P::AccColsPerTh]; + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = BaseClass::Zero; + } + } + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + } + + DI void loop() + { + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(kidx); + accumulate(); // on the previous k-block + this->stsXY(); + __syncthreads(); + this->pageWr ^= 1; + this->pageRd ^= 1; + } + accumulate(); // last iteration + } + + DI void epilog() + { + IdxT startx = blockIdx.x * P::Mblk + this->accrowid; + IdxT starty = blockIdx.y * P::Nblk + this->acccolid; + auto lid = raft::laneId(); + IdxT sums[P::AccColsPerTh]; #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - sums[j] = 0; - } + 
for (int j = 0; j < P::AccColsPerTh; ++j) { + sums[j] = 0; + } #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto xid = startx + i * P::AccThRows; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto xid = startx + i * P::AccThRows; #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto yid = starty + j * P::AccThCols; - auto is_neigh = acc[i][j] <= eps; - ///@todo: fix uncoalesced writes using shared mem - if (xid < this->m && yid < this->n) { - adj[xid * this->n + yid] = is_neigh; - sums[j] += is_neigh; - } - } - } - // perform reduction of adjacency values to compute vertex degrees - if (vd != nullptr) { updateVertexDegree(sums); } - } - - DI void accumulate() { -#pragma unroll - for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { - this->ldsXY(ki); + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto yid = starty + j * P::AccThCols; + auto is_neigh = acc[i][j] <= eps; + ///@todo: fix uncoalesced writes using shared mem + if (xid < this->m && yid < this->n) { + adj[xid * this->n + yid] = is_neigh; + sums[j] += is_neigh; + } + } + } + // perform reduction of adjacency values to compute vertex degrees + if (vd != nullptr) { updateVertexDegree(sums); } + } + + DI void accumulate() + { #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { + this->ldsXY(ki); #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - auto diff = this->regx[i][v] - this->regy[j][v]; - acc[i][j] += diff * diff; - } - } - } - } - } - - DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) { - __syncthreads(); // so that we can safely reuse smem - int gid = threadIdx.x / P::AccThCols; - int lid = threadIdx.x % P::AccThCols; - auto cidx = IdxT(blockIdx.y) * P::Nblk + lid; - IdxT totalSum = 0; - // update the individual vertex degrees + for (int j = 0; j < P::AccColsPerTh; ++j) { #pragma unroll - 
for (int i = 0; i < P::AccColsPerTh; ++i) { - sums[i] = batchedBlockReduce(sums[i], smem); - auto cid = cidx + i * P::AccThCols; - if (gid == 0 && cid < this->n) { - atomicUpdate(cid, sums[i]); - totalSum += sums[i]; - } - __syncthreads(); // for safe smem reuse - } - // update the total edge count - totalSum = raft::blockReduce(totalSum, smem); - if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); } - } - - DI void atomicUpdate(IdxT addrId, IdxT val) { - if (sizeof(IdxT) == 4) { - raft::myAtomicAdd((unsigned *) (vd + addrId), val); - } else if (sizeof(IdxT) == 8) { - raft::myAtomicAdd((unsigned long long *) (vd + addrId), val); - } - } - }; // struct EpsUnexpL2SqNeighborhood - - template - __global__ __launch_bounds__(Policy::Nthreads, - 2) - - void epsUnexpL2SqNeighKernel( - bool *adj, IdxT *vd, const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, DataT eps) { - extern __shared__ char smem[]; - EpsUnexpL2SqNeighborhood obj(adj, vd, x, y, m, n, k, eps, smem); - obj.run(); - } - - template - void epsUnexpL2SqNeighImpl(bool *adj, - IdxT *vd, - const DataT *x, - const DataT *y, - IdxT m, - IdxT n, - IdxT k, - DataT eps, - cudaStream_t stream) { - typedef typename raft::linalg::Policy4x4::Policy Policy; - dim3 grid(raft::ceildiv(m, Policy::Mblk), raft::ceildiv(n, Policy::Nblk)); - dim3 blk(Policy::Nthreads); - epsUnexpL2SqNeighKernel < DataT, IdxT, Policy > - <<>>(adj, vd, x, y, m, n, k, eps); - RAFT_CUDA_TRY(cudaGetLastError()); + for (int v = 0; v < P::Veclen; ++v) { + auto diff = this->regx[i][v] - this->regy[j][v]; + acc[i][j] += diff * diff; + } } + } + } + } + + DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) + { + __syncthreads(); // so that we can safely reuse smem + int gid = threadIdx.x / P::AccThCols; + int lid = threadIdx.x % P::AccThCols; + auto cidx = IdxT(blockIdx.y) * P::Nblk + lid; + IdxT totalSum = 0; + // update the individual vertex degrees +#pragma unroll + for (int i = 0; i < P::AccColsPerTh; ++i) { + sums[i] = 
batchedBlockReduce(sums[i], smem); + auto cid = cidx + i * P::AccThCols; + if (gid == 0 && cid < this->n) { + atomicUpdate(cid, sums[i]); + totalSum += sums[i]; + } + __syncthreads(); // for safe smem reuse + } + // update the total edge count + totalSum = raft::blockReduce(totalSum, smem); + if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); } + } + + DI void atomicUpdate(IdxT addrId, IdxT val) + { + if (sizeof(IdxT) == 4) { + raft::myAtomicAdd((unsigned*)(vd + addrId), val); + } else if (sizeof(IdxT) == 8) { + raft::myAtomicAdd((unsigned long long*)(vd + addrId), val); + } + } +}; // struct EpsUnexpL2SqNeighborhood + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) + + void epsUnexpL2SqNeighKernel( + bool* adj, IdxT* vd, const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps) +{ + extern __shared__ char smem[]; + EpsUnexpL2SqNeighborhood obj(adj, vd, x, y, m, n, k, eps, smem); + obj.run(); +} + +template +void epsUnexpL2SqNeighImpl(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) +{ + typedef typename raft::linalg::Policy4x4::Policy Policy; + dim3 grid(raft::ceildiv(m, Policy::Mblk), raft::ceildiv(n, Policy::Nblk)); + dim3 blk(Policy::Nthreads); + epsUnexpL2SqNeighKernel + <<>>(adj, vd, x, y, m, n, k, eps); + RAFT_CUDA_TRY(cudaGetLastError()); +} /** * @brief Computes epsilon neighborhood for the L2-Squared distance metric @@ -208,26 +217,27 @@ namespace raft { * @param[in] fop device lambda to do any other custom functions * @param[in] stream cuda stream */ - template - void epsUnexpL2SqNeighborhood(bool *adj, - IdxT *vd, - const DataT *x, - const DataT *y, - IdxT m, - IdxT n, - IdxT k, - DataT eps, - cudaStream_t stream) { - size_t bytes = sizeof(DataT) * k; - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { - epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { - epsUnexpL2SqNeighImpl(adj, vd, 
x, y, m, n, k, eps, stream); - } else { - epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); - } - } - } // namespace detail - } // namespace knn - } // namespace spatial +template +void epsUnexpL2SqNeighborhood(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + DataT eps, + cudaStream_t stream) +{ + size_t bytes = sizeof(DataT) * k; + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } else { + epsUnexpL2SqNeighImpl(adj, vd, x, y, m, n, k, eps, stream); + } +} +} // namespace detail +} // namespace knn +} // namespace spatial } // namespace raft diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp index 6ef95dc010..a25fd9295c 100644 --- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -39,18 +39,19 @@ namespace knn { * @param[in] fop device lambda to do any other custom functions * @param[in] stream cuda stream */ -template -void epsUnexpL2SqNeighborhood(bool *adj, - IdxT *vd, - const DataT *x, - const DataT *y, +template +void epsUnexpL2SqNeighborhood(bool* adj, + IdxT* vd, + const DataT* x, + const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps, - cudaStream_t stream) { - detail::epsUnexpL2SqNeighborhood(adj, vd, x, y, m, n, k, eps, stream); + cudaStream_t stream) +{ + detail::epsUnexpL2SqNeighborhood(adj, vd, x, y, m, n, k, eps, stream); } -} // namespace knn +} // namespace knn } // namespace spatial } // namespace raft diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp new file mode 100644 index 0000000000..765f07a012 --- /dev/null +++ b/cpp/include/raft/stats/common.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// This file is a shameless amalgamation of independent works done by +// Lars Nyland and Andy Adinets + +///@todo: add cub's histogram as another option + +namespace raft { +namespace stats { + +/** Default mapper which just returns the value of the data itself */ +template +struct IdentityBinner { + DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } +}; + +/** Types of support histogram implementations */ +enum HistType { + /** shared mem atomics but with bins to be 1b int's */ + HistTypeSmemBits1 = 1, + /** shared mem atomics but with bins to be 2b int's */ + HistTypeSmemBits2 = 2, + /** shared mem atomics but with bins to be 4b int's */ + HistTypeSmemBits4 = 4, + /** shared mem atomics but with bins to ba 1B int's */ + HistTypeSmemBits8 = 8, + /** shared mem atomics but with bins to be 2B int's */ + HistTypeSmemBits16 = 16, + /** use only global atomics */ + HistTypeGmem, + /** uses shared mem atomics to reduce global traffic */ + HistTypeSmem, + /** + * uses shared mem atomics with match_any intrinsic to further reduce shared + * memory traffic. This can only be enabled on Volta and later architectures. + * If one tries to enable this for older arch's, it will fall back to + * `HistTypeSmem`. 
+ * @note This is to be used only when the input dataset leads to a lot of + * repetitions in a given warp, else, this algo can be much slower than + * `HistTypeSmem`! + */ + HistTypeSmemMatchAny, + /** builds a hashmap of active bins in shared mem */ + HistTypeSmemHash, + /** decide at runtime the best algo for the given inputs */ + HistTypeAuto +}; +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh index 8c69ba1459..7c03561002 100644 --- a/cpp/include/raft/stats/detail/histogram.cuh +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -31,44 +32,6 @@ namespace raft { namespace stats { namespace detail { -/** Default mapper which just returns the value of the data itself */ -template -struct IdentityBinner { - DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } -}; - -/** Types of support histogram implementations */ -enum HistType { - /** shared mem atomics but with bins to be 1b int's */ - HistTypeSmemBits1 = 1, - /** shared mem atomics but with bins to be 2b int's */ - HistTypeSmemBits2 = 2, - /** shared mem atomics but with bins to be 4b int's */ - HistTypeSmemBits4 = 4, - /** shared mem atomics but with bins to ba 1B int's */ - HistTypeSmemBits8 = 8, - /** shared mem atomics but with bins to be 2B int's */ - HistTypeSmemBits16 = 16, - /** use only global atomics */ - HistTypeGmem, - /** uses shared mem atomics to reduce global traffic */ - HistTypeSmem, - /** - * uses shared mem atomics with match_any intrinsic to further reduce shared - * memory traffic. This can only be enabled on Volta and later architectures. - * If one tries to enable this for older arch's, it will fall back to - * `HistTypeSmem`. - * @note This is to be used only when the input dataset leads to a lot of - * repetitions in a given warp, else, this algo can be much slower than - * `HistTypeSmem`! 
- */ - HistTypeSmemMatchAny, - /** builds a hashmap of active bins in shared mem */ - HistTypeSmemHash, - /** decide at runtime the best algo for the given inputs */ - HistTypeAuto -}; - static const int ThreadsPerBlock = 256; template diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh index c2b14f1544..2a4a9bff93 100644 --- a/cpp/include/raft/stats/detail/minmax.cuh +++ b/cpp/include/raft/stats/detail/minmax.cuh @@ -23,16 +23,11 @@ namespace raft { namespace stats { - namespace detail { // TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it template -constexpr To - -bit_cast(const From& from) - - noexcept +constexpr To bit_cast(const From& from) noexcept { To to{}; static_assert(sizeof(To) == sizeof(From)); @@ -40,8 +35,6 @@ bit_cast(const From& from) return to; } -} // namespace detail - template struct encode_traits { }; @@ -243,5 +236,4 @@ void minmax(const T* data, }; // end namespace detail }; // end namespace stats -} -; // end namespace raft +}; // end namespace raft diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp index 30e982115a..d4d3b449f7 100644 --- a/cpp/include/raft/stats/histogram.hpp +++ b/cpp/include/raft/stats/histogram.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include // This file is a shameless amalgamation of independent works done by @@ -25,6 +26,7 @@ namespace raft { namespace stats { + /** * @brief Perform histogram on the input data. It chooses the right load size * based on the input data vector length. 
It also supports large-bin cases diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu index be62cf0208..7667f742ca 100644 --- a/cpp/test/spatial/epsilon_neighborhood.cu +++ b/cpp/test/spatial/epsilon_neighborhood.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 02e01deec3..2db64a7999 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu index b89e2ab208..ff538fcdca 100644 --- a/cpp/test/stats/histogram.cu +++ b/cpp/test/stats/histogram.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include namespace raft { namespace stats { diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu index 777ac800e9..61b16b65ae 100644 --- a/cpp/test/stats/minmax.cu +++ b/cpp/test/stats/minmax.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu index df77a19d73..ee58747b69 100644 --- a/cpp/test/stats/weighted_mean.cu +++ b/cpp/test/stats/weighted_mean.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include From 895c1c4e8ff64d065ddd8ae560b09cc0627901bd Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Feb 2022 19:43:53 -0500 Subject: [PATCH 17/24] Fixing style --- cpp/test/CMakeLists.txt | 1 + cpp/test/spatial/epsilon_neighborhood.cu | 207 +++++++++++------------ 2 files changed, 99 insertions(+), 109 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index fe51e5af02..430b69341c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -107,6 +107,7 @@ add_executable(test_raft test/spatial/fused_l2_knn.cu test/spatial/haversine.cu test/spatial/ball_cover.cu + test/spatial/epsilon_neighborhood.cu test/spatial/faiss_mr.cu test/spatial/selection.cu test/spectral_matrix.cu diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu index 7667f742ca..b39148957e 100644 --- a/cpp/test/spatial/epsilon_neighborhood.cu +++ b/cpp/test/spatial/epsilon_neighborhood.cu @@ -15,125 +15,114 @@ */ #include "test_utils.h" -#include #include #include #include #include +#include #include namespace raft { namespace spatial { namespace knn { - template - struct EpsInputs { - IdxT n_row, n_col, n_centers, n_batches; - T eps; - }; - - template - ::std::ostream &operator<<(::std::ostream &os, const EpsInputs &p) { - return os; - } - - template - class EpsNeighTest : public ::testing::TestWithParam> { - protected: - EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {} - - void SetUp() override { - param = ::testing::TestWithParam>::GetParam(); - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - data.resize(param.n_row * param.n_col, stream); - labels.resize(param.n_row, stream); - batchSize = param.n_row / param.n_batches; - adj.resize(param.n_row * batchSize, stream); - vd.resize(batchSize + 1, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream)); - Random::make_blobs(data.data(), - labels.data(), - param.n_row, - param.n_col, - param.n_centers, - stream, - true, - nullptr, - nullptr, - T(0.01), - false); - } - - void TearDown() 
override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } - - EpsInputs param; - cudaStream_t stream = 0; - rmm::device_uvector data; - rmm::device_uvector adj; - rmm::device_uvector labels, vd; - IdxT batchSize; - }; // class EpsNeighTest - - const std::vector > inputsfi = { - {15000, 16, 5, 1, 2.f}, - {14000, 16, 5, 1, 2.f}, - {15000, 17, 5, 1, 2.f}, - {14000, 17, 5, 1, 2.f}, - {15000, 18, 5, 1, 2.f}, - {14000, 18, 5, 1, 2.f}, - {15000, 32, 5, 1, 2.f}, - {14000, 32, 5, 1, 2.f}, - {20000, 10000, 10, 1, 2.f}, - {20000, 10000, 10, 2, 2.f}, - }; - typedef EpsNeighTest EpsNeighTestFI; - TEST_P(EpsNeighTestFI, Result - ) { - for ( - int i = 0; - i(adj - . - - data(), - vd - - . - - data(), - data - - . - - data(), - data - - . - - data() - - + ( - i *batchSize - * param.n_col), - param.n_row, - batchSize, - param.n_col, - param. - eps *param - .eps, - stream); - ASSERT_TRUE(raft::devArrMatch( - param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare(), stream) - ); - } +template +struct EpsInputs { + IdxT n_row, n_col, n_centers, n_batches; + T eps; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const EpsInputs& p) +{ + return os; +} + +template +class EpsNeighTest : public ::testing::TestWithParam> { + protected: + EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {} + + void SetUp() override + { + param = ::testing::TestWithParam>::GetParam(); + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + data.resize(param.n_row * param.n_col, stream); + labels.resize(param.n_row, stream); + batchSize = param.n_row / param.n_batches; + adj.resize(param.n_row * batchSize, stream); + vd.resize(batchSize + 1, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream)); + random::make_blobs(data.data(), + labels.data(), + param.n_row, + param.n_col, + param.n_centers, + stream, + true, + nullptr, + nullptr, + T(0.01), + false); + } + + void TearDown() override { 
RAFT_CUDA_TRY(cudaStreamDestroy(stream)); } + + EpsInputs param; + cudaStream_t stream = 0; + rmm::device_uvector data; + rmm::device_uvector adj; + rmm::device_uvector labels, vd; + IdxT batchSize; +}; // class EpsNeighTest + +const std::vector> inputsfi = { + {15000, 16, 5, 1, 2.f}, + {14000, 16, 5, 1, 2.f}, + {15000, 17, 5, 1, 2.f}, + {14000, 17, 5, 1, 2.f}, + {15000, 18, 5, 1, 2.f}, + {14000, 18, 5, 1, 2.f}, + {15000, 32, 5, 1, 2.f}, + {14000, 32, 5, 1, 2.f}, + {20000, 10000, 10, 1, 2.f}, + {20000, 10000, 10, 2, 2.f}, +}; +typedef EpsNeighTest EpsNeighTestFI; +TEST_P(EpsNeighTestFI, Result) +{ + for (int i = 0; i < param.n_batches; ++i) { + RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream)); + epsUnexpL2SqNeighborhood(adj. + + data(), + vd + + . + + data(), + data + + . + + data(), + data + + . + + data() + + + (i * batchSize * param.n_col), + param.n_row, + batchSize, + param.n_col, + param.eps * param.eps, + stream); + ASSERT_TRUE(raft::devArrMatch( + param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare(), stream)); + } } -INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi) -); +INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi)); }; // namespace knn }; // namespace spatial From ce425d4d89ebdcf1727f2b068a76364333161245 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Feb 2022 19:56:25 -0500 Subject: [PATCH 18/24] Updating year andeive --- cpp/include/raft/common/{seive.cuh => seive.hpp} | 3 +-- cpp/include/raft/stats/detail/histogram.cuh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) rename cpp/include/raft/common/{seive.cuh => seive.hpp} (97%) diff --git a/cpp/include/raft/common/seive.cuh b/cpp/include/raft/common/seive.hpp similarity index 97% rename from cpp/include/raft/common/seive.cuh rename to cpp/include/raft/common/seive.hpp index 580d9d91cb..6d7de24ecd 100644 --- a/cpp/include/raft/common/seive.cuh +++ b/cpp/include/raft/common/seive.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #pragma once -#include #include // Taken from: diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh index 7c03561002..65241f524f 100644 --- a/cpp/include/raft/stats/detail/histogram.cuh +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include #include From 3d0b16769578b4c71e6e6fbbe243f6128b9dcc49 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 19:58:08 -0500 Subject: [PATCH 19/24] proper include --- cpp/test/spatial/epsilon_neighborhood.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu index b39148957e..33af5726a0 100644 --- a/cpp/test/spatial/epsilon_neighborhood.cu +++ b/cpp/test/spatial/epsilon_neighborhood.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include From 368513b14e84aa4af50a3f60adbba794596b7450 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Feb 2022 20:59:19 -0500 Subject: [PATCH 20/24] Proper filename --- cpp/test/common/seive.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu index ca46397b19..8044dbb532 100644 --- a/cpp/test/common/seive.cu +++ b/cpp/test/common/seive.cu @@ -15,7 +15,7 @@ */ #include -#include +#include namespace raft { namespace common { From 8b5e5c50006161925739779dd0a07b86c04a1f8b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Feb 2022 22:05:32 -0500 Subject: [PATCH 21/24] Adding missing include --- cpp/include/raft/common/seive.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp index 6d7de24ecd..e613f1e5c2 100644 --- a/cpp/include/raft/common/seive.hpp +++ b/cpp/include/raft/common/seive.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include // Taken from: From 413b0d462e5a57b2ec0fabb538dd7d2bdfa3dd10 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 11 Feb 2022 15:03:09 -0500 Subject: [PATCH 22/24] Updating rsvd test --- cpp/include/raft/random/detail/make_blobs.cuh | 133 +++++++++++++----- cpp/test/linalg/rsvd.cu | 112 ++++++++------- cpp/test/random/make_blobs.cu | 110 ++++++--------- 3 files changed, 206 insertions(+), 149 deletions(-) diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh index 528d20a284..fff1ab835b 100644 --- a/cpp/include/raft/random/detail/make_blobs.cuh +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -16,18 +16,18 @@ #pragma once +#include "permute.cuh" #include #include #include -#include #include #include #include -namespace raft::random { -namespace detail { +namespace raft { +namespace random { -namespace { +namespace detail { // generate the labels first and shuffle them instead of shuffling the dataset template @@ -90,23 +90,29 @@ DI void get_mu_sigma(DataT& mu, } template -void generate_data(DataT* out, - const IdxT* labels, - IdxT n_rows, - IdxT n_cols, - IdxT n_clusters, - cudaStream_t stream, - bool row_major, - const DataT* centers, - const DataT* cluster_std, - const DataT cluster_std_scalar, - raft::random::Rng& rng) +__global__ void generate_data_kernel(DataT* out, + const IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + raft::random::RngState rng_state) { - auto op = [=] __device__(DataT & val1, DataT & val2, IdxT idx1, IdxT idx2) { + uint64_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; + raft::random::PhiloxGenerator gen(rng_state, tid); + const IdxT stride = gridDim.x * blockDim.x; + IdxT len = n_rows * n_cols; + for (IdxT idx = tid; idx < len; idx += stride) { + DataT val1, val2; + gen.next(val1); + gen.next(val2); DataT mu1, sigma1, mu2, sigma2; get_mu_sigma(mu1, sigma1, - idx1, + idx, labels, row_major, centers, @@ -117,7 +123,7 @@ void generate_data(DataT* out, 
n_clusters); get_mu_sigma(mu2, sigma2, - idx2, + idx + stride, labels, row_major, centers, @@ -127,12 +133,74 @@ void generate_data(DataT* out, n_cols, n_clusters); raft::random::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2); - }; - rng.custom_distribution2(out, n_rows * n_cols, op, stream); + + if (idx < len) out[idx] = val1; + idx += stride; + if (idx < len) out[idx] = val2; + } } -} // namespace +template +void generate_data(DataT* out, + const IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + raft::random::RngState& rng_state) +{ + IdxT items = n_rows * n_cols; + IdxT nBlocks = (items + 127) / 128; + generate_data_kernel<<>>(out, + labels, + n_rows, + n_cols, + n_clusters, + row_major, + centers, + cluster_std, + cluster_std_scalar, + rng_state); +} +/** + * @brief GPU-equivalent of sklearn.datasets.make_blobs + * + * @tparam DataT output data type + * @tparam IdxT indexing arithmetic type + * + * @param[out] out generated data [on device] + * [dim = n_rows x n_cols] + * @param[out] labels labels for the generated data [on device] + * [len = n_rows] + * @param[in] n_rows number of rows in the generated data + * @param[in] n_cols number of columns in the generated data + * @param[in] n_clusters number of clusters (or classes) to generate + * @param[in] stream cuda stream to schedule the work on + * @param[in] row_major whether input `centers` and output `out` + * buffers are to be stored in row or column + * major layout + * @param[in] centers centers of each of the cluster, pass a nullptr + * if you need this also to be generated randomly + * [on device] [dim = n_clusters x n_cols] + * @param[in] cluster_std standard deviation of each cluster center, + * pass a nullptr if this is to be read from the + * `cluster_std_scalar`. 
[on device] + * [len = n_clusters] + * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as + * the std-dev across all dimensions. + * @param[in] shuffle shuffle the generated dataset and labels + * @param[in] center_box_min min value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] center_box_max max value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] seed seed for the RNG + * @param[in] type RNG type + */ template void make_blobs_caller(DataT* out, IdxT* labels, @@ -140,15 +208,15 @@ void make_blobs_caller(DataT* out, IdxT n_cols, IdxT n_clusters, cudaStream_t stream, - bool row_major = true, - const DataT* centers = nullptr, - const DataT* cluster_std = nullptr, - const DataT cluster_std_scalar = (DataT)1.0, - bool shuffle = true, - DataT center_box_min = (DataT)-10.0, - DataT center_box_max = (DataT)10.0, - uint64_t seed = 0ULL, - raft::random::GeneratorType type = raft::random::GenPhilox) + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + bool shuffle, + DataT center_box_min, + DataT center_box_max, + uint64_t seed, + raft::random::GeneratorType type) { raft::random::Rng r(seed, type); // use the right centers buffer for data generation @@ -172,8 +240,9 @@ void make_blobs_caller(DataT* out, _centers, cluster_std, cluster_std_scalar, - r); + r.state); } } // end namespace detail -} // end namespace raft::random \ No newline at end of file +} // end namespace random +} // end namespace raft \ No newline at end of file diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu index b8e44580b5..da38464bf7 100644 --- a/cpp/test/linalg/rsvd.cu +++ b/cpp/test/linalg/rsvd.cu @@ -31,6 +31,7 @@ struct RsvdInputs { T tolerance; int n_row; int n_col; + float redundancy; T PC_perc; T UpS_perc; int k; @@ -66,7 +67,7 @@ class RsvdTest : public ::testing::TestWithParam> { params = 
::testing::TestWithParam>::GetParam(); // rSVD seems to be very sensitive to the random number sequence as well! - raft::random::Rng r(params.seed, raft::random::GenTaps); + raft::random::Rng r(params.seed, raft::random::GenPC); int m = params.n_row, n = params.n_col; T eig_svd_tol = 1.e-7; int max_sweeps = 100; @@ -91,8 +92,19 @@ class RsvdTest : public ::testing::TestWithParam> { raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream); raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream); - } else { // Other normal tests - r.normal(A.data(), m * n, mu, sigma, stream); + } else { // Other normal tests + int n_informative = int(0.25f * n); // Informative cols + int len_informative = m * n_informative; + + int n_redundant = n - n_informative; // Redundant cols + int len_redundant = m * n_redundant; + + r.normal(A.data(), len_informative, mu, sigma, stream); + CUDA_CHECK(cudaMemcpyAsync(A.data() + len_informative, + A.data(), + len_redundant * sizeof(T), + cudaMemcpyDeviceToDevice, + stream)); } std::vector A_backup_cpu(m * n); // Backup A matrix as svdJacobi will destroy the content of A @@ -157,59 +169,65 @@ class RsvdTest : public ::testing::TestWithParam> { const std::vector> inputs_fx = { // Test with ratios - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {0.12f, 2048, 256, 
0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT + {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + + {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Tall + non-BBT + + {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Square + BBT + {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, // Tall + BBT + + {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + + {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Tall + non-BBT + + {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Square + BBT + {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL}, // Tall + BBT + + {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL} // Tall + non-BBT }; const std::vector> inputs_dx = { // Test with ratios - {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.20, 2048, 256, 0.2, 
0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT - {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT - {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT - - , // Test with fixed ranks - {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT - {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT - {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT + {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Tall + non-BBT + {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Square + BBT + {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, // Tall + BBT + {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, // Square + non-BBT + {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL} // Tall + non-BBT + + , // Test with fixed ranks + {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {0.12, 2048, 256, 0.25f, 0.0, 
0.0, 100, 5, false, 4321ULL}, // Tall + non-BBT + {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Square + BBT + {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL}, // Tall + BBT + {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}, // Square + non-BBT + {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL} // Tall + non-BBT }; const std::vector> sanity_inputs_fx = { - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL}, - {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; + {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, true, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}, + {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, false, 4321ULL}}; const std::vector> sanity_inputs_dx = { - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL}, - {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL}, - {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}}; + {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, true, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}, + {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, false, 4321ULL}}; typedef RsvdTest RsvdSanityCheckValF; TEST_P(RsvdSanityCheckValF, Result) diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu index 8c7e440d0e..b1ce4b3236 100644 --- a/cpp/test/random/make_blobs.cu +++ b/cpp/test/random/make_blobs.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#include "../test_utils.h" +#include "test_utils.h" #include #include #include #include #include -namespace raft::random { +namespace raft { +namespace random { template __global__ void meanKernel(T* out, @@ -136,8 +137,8 @@ class MakeBlobsTest : public ::testing::TestWithParam> { { int len = params.n_clusters * params.cols; auto compare = raft::CompareApprox(num_sigma * params.tolerance); - ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare, stream)); - ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare, stream)); + ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare)); + ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare)); } protected: @@ -153,53 +154,37 @@ typedef MakeBlobsTest MakeBlobsTestF; const std::vector> inputsf_t = { {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, 
false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - + {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + + {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, 
false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPC, 1234ULL}, }; TEST_P(MakeBlobsTestF, Result) { check(); } @@ -209,55 +194,40 @@ typedef MakeBlobsTest MakeBlobsTestD; const std::vector> inputsd_t = { {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, false, 
raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, 
true, raft::random::GenPC, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPC, 1234ULL}, {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, {0.011, 5003, 8, 5, 1.0, false, true, 
raft::random::GenPhilox, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, - {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, - {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPC, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPC, 1234ULL}, }; TEST_P(MakeBlobsTestD, Result) { check(); } INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestD, ::testing::ValuesIn(inputsd_t)); -} // end namespace raft::random \ No newline at end of file +} // end namespace random +} // end namespace raft \ No newline at end of file From 02341d60149d1eb83da28a748e564b7da76a1603 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Feb 2022 15:09:32 -0500 Subject: [PATCH 23/24] iFixing style --- cpp/test/random/make_blobs.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu index b1ce4b3236..caad627d49 100644 --- a/cpp/test/random/make_blobs.cu +++ b/cpp/test/random/make_blobs.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "test_utils.h" +#include "../test_utils.h" #include #include #include From a3c59da08a8d01ca411aedcddfac26fe477783e5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Feb 2022 16:18:31 -0500 Subject: [PATCH 24/24] Fixing docs --- cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp index a25fd9295c..cd9163096a 100644 --- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -34,9 +34,11 @@ namespace knn { * matrix. Pass a nullptr if you don't need this info. 
* @param[in] x first matrix [row-major] [on device] [dim = m x k] * @param[in] y second matrix [row-major] [on device] [dim = n x k] + * @param[in] m number of rows in x + * @param[in] n number of rows in y + * @param[in] k number of columns in x and y * @param[in] eps defines epsilon neighborhood radius (should be passed as * squared as we compute L2-squared distance in this method) - * @param[in] fop device lambda to do any other custom functions * @param[in] stream cuda stream */ template