From 478ddaca97a25e20f1f44f865693e507028f969b Mon Sep 17 00:00:00 2001 From: divyegala Date: Thu, 21 Oct 2021 15:16:54 -0700 Subject: [PATCH 01/17] working through --- cpp/include/raft/linalg/add.cuh | 19 +----- cpp/include/raft/linalg/binary_op.cuh | 55 +++------------- cpp/include/raft/linalg/detail/add.cuh | 47 ++++++++++++++ cpp/include/raft/linalg/detail/binary_op.cuh | 64 +++++++++++++++++++ .../raft/linalg/detail/cholesky_r1_update.cuh | 0 5 files changed, 123 insertions(+), 62 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/add.cuh create mode 100644 cpp/include/raft/linalg/detail/binary_op.cuh create mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.cuh diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 7a454f64e2..9aa1e6e82a 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -16,6 +16,8 @@ #pragma once +#include "detail/add.cuh" + #include "binary_op.cuh" #include "unary_op.cuh" @@ -63,16 +65,6 @@ void add(OutT *out, const InT *in1, const InT *in2, IdxType len, binaryOp(out, in1, in2, len, op, stream); } -template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } -} - /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing @@ -86,12 +78,7 @@ template void addDevScalar(math_t *outDev, const math_t *inDev, const math_t *singleScalarDev, IdxType len, cudaStream_t stream) { - // TODO: block dimension has not been tuned - dim3 block(256); - dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); - 
CUDA_CHECK(cudaPeekAtLastError()); + detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 940d786e87..eb2831b7e8 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -16,51 +16,14 @@ #pragma once +#include "detail/binary_op.cuh" + #include #include namespace raft { namespace linalg { -template -__global__ void binaryOpKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len, Lambda op) { - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a, b; - OutVecType c; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); -#pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - c.val.data[i] = op(a.val.data[i], b.val.data[i]); - } - c.store(out, idx); -} - -template -void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, - IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - binaryOpKernel - <<>>(out, in1, in2, len, op); - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief Checks if addresses are aligned on N bytes - */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, - uint64_t N) { - return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; -} - /** * @brief perform element-wise binary operation on the input arrays * @tparam InType input data-type @@ -88,26 +51,26 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, uint64_t in2Addr = uint64_t(in2); uint64_t outAddr = uint64_t(out); if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { - binaryOpImpl( + detail::addressAligned(in1Addr, in2Addr, outAddr, 16)) { + detail::binaryOpImpl( out, in1, in2, len, op, stream); } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { - binaryOpImpl( + detail::binaryOpImpl( out, in1, in2, len, op, stream); } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { - binaryOpImpl( + detail:: binaryOpImpl( out, in1, in2, len, op, stream); } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { - binaryOpImpl( + detail::binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { - binaryOpImpl( + detail::binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, + detail::binaryOpImpl(out, in1, in2, len, op, stream); } } diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh new file mode 100644 index 0000000000..550f1bcde3 --- /dev/null +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { +IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; +if (i < len) { + outDev[i] = inDev[i] + *singleScalarDev; +} +} + +template +void addDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { +// TODO: block dimension has not been tuned +dim3 block(256); +dim3 grid(raft::ceildiv(len, (IdxType)block.x)); +add_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); +CUDA_CHECK(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh new file mode 100644 index 0000000000..866bedf1ba --- /dev/null +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #pragma once + + namespace raft { + namespace linalg { + namespace detail { + + template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, + IdxType len, Lambda op, cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Checks if addresses are aligned on N bytes + */ +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, + uint64_t N) { + return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; +} + + } // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh new file mode 100644 index 0000000000..e69de29bb2 From d4b72ba275bd42b68a437d988b729c8d00432ce4 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 5 Nov 2021 15:41:27 -0700 Subject: [PATCH 02/17] working ththrough --- cpp/include/raft/linalg/add.cuh | 9 +- cpp/include/raft/linalg/binary_op.cuh | 1 - .../raft/linalg/cholesky_r1_update.cuh | 89 +---- .../raft/linalg/coalesced_reduction.cuh | 54 +--- cpp/include/raft/linalg/contractions.cuh | 278 +--------------- cpp/include/raft/linalg/detail/binary_op.cuh | 2 + 
.../raft/linalg/detail/cholesky_r1_update.cuh | 0 .../raft/linalg/detail/cholesky_r1_update.hpp | 119 +++++++ .../linalg/detail/coalesced_reduction.cuh | 89 +++++ .../raft/linalg/detail/contractions.cuh | 303 ++++++++++++++++++ cpp/include/raft/linalg/detail/eig.hpp | 167 ++++++++++ cpp/include/raft/linalg/detail/functional.cuh | 80 +++++ cpp/include/raft/linalg/divide.cuh | 5 +- cpp/include/raft/linalg/eig.cuh | 107 +------ cpp/include/raft/linalg/eltwise.cuh | 27 +- 15 files changed, 794 insertions(+), 536 deletions(-) delete mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.cuh create mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.hpp create mode 100644 cpp/include/raft/linalg/detail/coalesced_reduction.cuh create mode 100644 cpp/include/raft/linalg/detail/contractions.cuh create mode 100644 cpp/include/raft/linalg/detail/eig.hpp create mode 100644 cpp/include/raft/linalg/detail/functional.cuh diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 9aa1e6e82a..de614185c0 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -17,6 +17,7 @@ #pragma once #include "detail/add.cuh" +#include "detail/functional.cuh" #include "binary_op.cuh" #include "unary_op.cuh" @@ -24,6 +25,8 @@ namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @brief Elementwise scalar add operation on the input buffer * @@ -41,8 +44,7 @@ namespace linalg { template void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; - unaryOp(out, in, len, op, stream); + unaryOp(out, in, len, adds_scalar(scalar), stream); } /** @@ -61,8 +63,7 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, template void add(OutT *out, const InT *in1, const InT *in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; - binaryOp(out, in1, in2, len, 
op, stream); + binaryOp(out, in1, in2, len, thrust::plus(), stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index eb2831b7e8..56dc9cf623 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -19,7 +19,6 @@ #include "detail/binary_op.cuh" #include -#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index d6d064c20e..e72d3e963f 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -16,11 +16,7 @@ #pragma once -#include -#include -#include -#include -#include +#include "detail/cholesky_r1_update.hpp" namespace raft { namespace linalg { @@ -125,88 +121,7 @@ template void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, void *workspace, int *n_bytes, cublasFillMode_t uplo, cudaStream_t stream, math_t eps = -1) { - // The matrix A' is defined as: - // A' = [[A_11, A_12] - // [A_21, A_22]] - // where: - // - A_11 = A, matrix of size (n-1)x(n-1) - // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements - // - A_22 = A_new[n-1] scalar. - // - // Instead of caclulating the Cholelsky decomposition of A' from scratch, - // we just update L with the new row. The new Cholesky decomposition will be - // calculated as: - // L' = [[L_11, 0] - // [L_12, L_22]] - // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and - // L_12 and L_22 are the new quantities that we need to calculate. - - // We need a workspace in device memory to store a scalar. Additionally, in - // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. - const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? 
raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; - if (workspace == nullptr) { - *n_bytes = offset + 1 * sizeof(math_t); - return; - } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; - - math_t *A_new; - math_t *A_row; - if (uplo == CUBLAS_FILL_MODE_UPPER) { - // A_new is stored as the n-1 th column of L - A_new = L + (n - 1) * ld; - } else { - // If the input is lower triangular, then the new elements of A are stored - // as the n-th row of L. Since the matrix is column major, this is non - // contiguous. We copy elements from A_row to a contiguous workspace A_new. - A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); - } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; - if (n > 1) { - // Calculate L_12 = x by solving equation L_11 x = A_12 - math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); - - // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); - - if (uplo == CUBLAS_FILL_MODE_LOWER) { - // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); - } - } else { // n == 1 case - CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); - } - - // L_22 = sqrt(A_22 - L_12 * L_12) - math_t s_host; - math_t L_22_host; - raft::update_host(&s_host, s, 1, stream); - raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 - CUDA_CHECK(cudaStreamSynchronize(stream)); - L_22_host = std::sqrt(L_22_host - s_host); - - // Check for numeric error with sqrt. 
If the matrix is not positive definit or - // the system is very ill conditioned then the A_22 - L_12 * L_12 can be - // negative, which would result L_22 = NaN. A small positive eps parameter - // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } - ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); - raft::update_device(L_22, &L_22_host, 1, stream); + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); } }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index ef983ff3d0..d11489bd7e 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -16,42 +16,11 @@ #pragma once -#include -#include +#include "detail/coalesced_reduction.cuh" namespace raft { namespace linalg { -// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension -// of the matrix, i.e. reduce along rows for row major or reduce along columns -// for column major layout. Kernel does an inplace reduction adding to original -// values of dots. 
-template -__global__ void coalescedReductionKernel(OutType *dots, const InType *data, - int D, int N, OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; - for (IdxType i = threadIdx.x; i < D; i += TPB) { - IdxType idx = rowStart + i; - thread_data = reduce_op(thread_data, main_op(data[idx], i)); - } - OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); - if (threadIdx.x == 0) { - if (inplace) { - dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); - } else { - dots[blockIdx.x] = final_op(acc); - } - } -} - /** * @brief Compute reduction of the input matrix along the leading dimension * @@ -88,26 +57,7 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N, MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - // One block per reduction - // Efficient only for large leading dimensions - if (D <= 32) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); - } else if (D <= 64) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); - } else if (D <= 128) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); - } else { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); - } - CUDA_CHECK(cudaPeekAtLastError()); + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op) } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index e6ff8a49ce..107c8ac3c2 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ 
b/cpp/include/raft/linalg/contractions.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include "detail/contractions.cuh" namespace raft { namespace linalg { @@ -201,281 +201,7 @@ struct Policy2x8 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template -struct Contractions_NT { - protected: - typedef Policy P; - - /** number of rows in X */ - IdxT m; - /** number of rows in Y */ - IdxT n; - /** number of columns in X and Y */ - IdxT k; - /** leading dimension in X */ - IdxT lda; - /** leading dimension in Y */ - IdxT ldb; - /** leading dimension in Output D */ - IdxT ldd; - - /** current thread's global mem row id for X data */ - IdxT xrowid; - /** current thread's global mem row id for Y data */ - IdxT yrowid; - /** global memory pointer to X matrix */ - const DataT* x; - /** global memory pointer to Y matrix */ - const DataT* y; - - /** current thread's smem row id */ - int srowid; - /** current thread's smem column id */ - int scolid; - /** current thread's accumulation row id */ - int accrowid; - /** current thread's accumulation column id */ - int acccolid; - - /** base smem pointer for X data storage */ - DataT* sx; - /** base smem pointer for Y data storage */ - DataT* sy; - /** index pointing the correct smem page for writing after `ldgXY()` */ - int pageWr; - /** index pointing the correct smem page for reading during `ldsXY()` */ - int pageRd; - - /** block of X data loaded from smem after `ldsXY()` */ - DataT regx[P::AccRowsPerTh][P::Veclen]; - /** block of Y data loaded from smem after `ldsXY()` */ - DataT regy[P::AccColsPerTh][P::Veclen]; - /** block of X data loaded from global mem after `ldgXY()` */ - DataT ldgDataX[P::LdgPerThX][P::Veclen]; - /** block of Y data loaded from global mem after `ldgXY()` */ - DataT ldgDataY[P::LdgPerThY][P::Veclen]; - - static const DataT Zero = (DataT)0; - - public: - /** - * @brief Ctor - * @param[in] _x X matrix. 
[on device] [dim = _m x _k] [row-major] - * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] - * @param[in] _m number of rows of X - * @param[in] _n number of rows of Y - * @param[in] _k number of cols of X and Y - * @param[in] _smem shared memory region used during computations - */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) - : m(_m), - n(_n), - k(_k), - lda(_k), - ldb(_k), - xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), - yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), - x(_x + xrowid * lda), - y(_y + yrowid * ldb), - srowid(threadIdx.x / P::LdgThRow), - scolid((threadIdx.x % P::LdgThRow) * P::Veclen), - accrowid(threadIdx.x / P::AccThCols), - acccolid(threadIdx.x % P::AccThCols), - sx((DataT*)_smem), - sy(&(sx[P::SmemPageX])), - pageWr(0), - pageRd(0) {} - - /** - * @brief Ctor - * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] - * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] - * @param[in] _m number of rows of X - * @param[in] _n number of rows of Y - * @param[in] _k number of cols of X and Y - * @param[in] _smem shared memory region used during computations - */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) - : m(_m), - n(_n), - k(_k), - lda(_lda), - ldb(_ldb), - ldd(_ldd), - srowid(threadIdx.x / P::LdgThRow), - scolid((threadIdx.x % P::LdgThRow) * P::Veclen), - accrowid(threadIdx.x / P::AccThCols), - acccolid(threadIdx.x % P::AccThCols), - sx((DataT*)_smem), - sy(&(sx[P::SmemPageX])), - pageWr(0), - pageRd(0) { - if (isRowMajor) { - xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; - yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; - } else { - xrowid = IdxT(blockIdx.y) * P::Mblk; - yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; - } - } - - 
protected: - /** - * @brief Load current block of X/Y from global memory to registers - * @param[in] kidx current start index of k to be loaded - */ - DI void ldgXY(IdxT kidx) { - ldgX(kidx); - ldgY(kidx); - } - - /** - * @brief Store current block of X/Y from registers to smem - * @param[in] kidx current start index of k to be loaded - */ - DI void stsXY() { - stsX(sx + pageWr * P::SmemPage); - stsY(sy + pageWr * P::SmemPage); - } - - /** - * @brief Load X and Y block from shared memory to registers - * @param[in] kidx k value from the current k-block to be loaded from smem - */ - DI void ldsXY(int kidx) { - ldsX(kidx, sx + pageRd * P::SmemPage); - ldsY(kidx, sy + pageRd * P::SmemPage); - } - - private: - DI void ldgX(IdxT kidx) { - if (isRowMajor) { - auto numRows = m; - auto koffset = kidx + scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThX; ++i) { - if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) { - ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset); - } else { -#pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataX[i][j] = Zero; - } - } - } - } else { - const auto numRows = k; - auto koffset = scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { - ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); - } else { -#pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataX[i][j] = Zero; - } - } - } - } - } - - DI void ldgY(IdxT kidx) { - if (isRowMajor) { - auto numRows = n; - auto koffset = kidx + scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThY; ++i) { - if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) { - ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset); - } else { -#pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; - } - } - } - } else { - auto numRows = k; - auto koffset = scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + 
yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { - ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); - } else { -#pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; - } - } - } - } - } - - DI void stsX(DataT* smem) { - auto* saddr = smem + srowid * P::SmemStride + scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThX; ++i) { - sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]); - } - } - - DI void stsY(DataT* smem) { - auto* saddr = smem + srowid * P::SmemStride + scolid; -#pragma unroll - for (int i = 0; i < P::LdgPerThY; ++i) { - sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]); - } - } - - DI void ldsX(int kidx, DataT* smem) { - if (isRowMajor) { - auto* saddr = smem + accrowid * P::SmemStride + kidx; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - lds(regx[i], saddr + i * P::AccThRows * P::SmemStride); - } - } else { - auto* saddr = smem + accrowid + kidx * P::SmemStride; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride]; - } - } - } - } - - DI void ldsY(int kidx, DataT* smem) { - if (isRowMajor) { - auto* saddr = smem + acccolid * P::SmemStride + kidx; -#pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - lds(regy[i], saddr + i * P::AccThCols * P::SmemStride); - } - } else { - auto* saddr = smem + acccolid + kidx * P::SmemStride; -#pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride]; - } - } - } - } - -}; // struct Contractions_NT +using detail::Contractions_NT; } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh index 866bedf1ba..89876afe46 100644 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ 
b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -16,6 +16,8 @@ #pragma once + #include + namespace raft { namespace linalg { namespace detail { diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp new file mode 100644 index 0000000000..b5f81579a6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, + void *workspace, int *n_bytes, cublasFillMode_t uplo, + cudaStream_t stream, math_t eps = -1) { + // The matrix A' is defined as: + // A' = [[A_11, A_12] + // [A_21, A_22]] + // where: + // - A_11 = A, matrix of size (n-1)x(n-1) + // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements + // - A_22 = A_new[n-1] scalar. + // + // Instead of caclulating the Cholelsky decomposition of A' from scratch, + // we just update L with the new row. 
The new Cholesky decomposition will be + // calculated as: + // L' = [[L_11, 0] + // [L_12, L_22]] + // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and + // L_12 and L_22 are the new quantities that we need to calculate. + + // We need a workspace in device memory to store a scalar. Additionally, in + // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. + const int align = 256; + int offset = (uplo == CUBLAS_FILL_MODE_LOWER) + ? raft::alignTo(sizeof(math_t) * (n - 1), align) + : 0; + if (workspace == nullptr) { + *n_bytes = offset + 1 * sizeof(math_t); + return; + } + math_t *s = reinterpret_cast(((char *)workspace) + offset); + math_t *L_22 = L + (n - 1) * ld + n - 1; + + math_t *A_new; + math_t *A_row; + if (uplo == CUBLAS_FILL_MODE_UPPER) { + // A_new is stored as the n-1 th column of L + A_new = L + (n - 1) * ld; + } else { + // If the input is lower triangular, then the new elements of A are stored + // as the n-th row of L. Since the matrix is column major, this is non + // contiguous. We copy elements from A_row to a contiguous workspace A_new. + A_row = L + n - 1; + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, + A_row, ld, A_new, 1, stream)); + } + cublasOperation_t op = + (uplo == CUBLAS_FILL_MODE_UPPER) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + if (n > 1) { + // Calculate L_12 = x by solving equation L_11 x = A_12 + math_t alpha = 1; + CUBLAS_CHECK(raft::linalg::cublastrsm( + handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, + CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + + // A_new now stores L_12, we calculate s = L_12 * L_12 + CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, + A_new, 1, A_new, 1, s, stream)); + + if (uplo == CUBLAS_FILL_MODE_LOWER) { + // Copy back the L_12 elements as the n-th row of L + CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, + A_new, 1, A_row, ld, stream)); + } + } else { // n == 1 case + CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); + } + + // L_22 = sqrt(A_22 - L_12 * L_12) + math_t s_host; + math_t L_22_host; + raft::update_host(&s_host, s, 1, stream); + raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 + CUDA_CHECK(cudaStreamSynchronize(stream)); + L_22_host = std::sqrt(L_22_host - s_host); + + // Check for numeric error with sqrt. If the matrix is not positive definit or + // the system is very ill conditioned then the A_22 - L_12 * L_12 can be + // negative, which would result L_22 = NaN. A small positive eps parameter + // can be used to prevent this. + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { + L_22_host = eps; + } + ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); + raft::update_device(L_22, &L_22_host, 1, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh new file mode 100644 index 0000000000..f9c5223bdc --- /dev/null +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #pragma once + +#include +#include + + namespace raft { + namespace linalg { + namespace detail { + +// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension +// of the matrix, i.e. reduce along rows for row major or reduce along columns +// for column major layout. Kernel does an inplace reduction adding to original +// values of dots. +template +__global__ void coalescedReductionKernel(OutType *dots, const InType *data, + int D, int N, OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); + } else { + dots[blockIdx.x] = final_op(acc); + } + } +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType *dots, const InType *data, int D, int N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = 
raft::Sum(), + FinalLambda final_op = raft::Nop()) { + // One block per reduction + // Efficient only for large leading dimensions + if (D <= 32) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 64) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 128) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + + } // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh new file mode 100644 index 0000000000..2e4657ebc3 --- /dev/null +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + #pragma once + + #include + + namespace raft { + namespace linalg { + namespace detail { + + template +struct Contractions_NT { +protected: +typedef Policy P; + +/** number of rows in X */ +IdxT m; +/** number of rows in Y */ +IdxT n; +/** number of columns in X and Y */ +IdxT k; +/** leading dimension in X */ +IdxT lda; +/** leading dimension in Y */ +IdxT ldb; +/** leading dimension in Output D */ +IdxT ldd; + +/** current thread's global mem row id for X data */ +IdxT xrowid; +/** current thread's global mem row id for Y data */ +IdxT yrowid; +/** global memory pointer to X matrix */ +const DataT* x; +/** global memory pointer to Y matrix */ +const DataT* y; + +/** current thread's smem row id */ +int srowid; +/** current thread's smem column id */ +int scolid; +/** current thread's accumulation row id */ +int accrowid; +/** current thread's accumulation column id */ +int acccolid; + +/** base smem pointer for X data storage */ +DataT* sx; +/** base smem pointer for Y data storage */ +DataT* sy; +/** index pointing the correct smem page for writing after `ldgXY()` */ +int pageWr; +/** index pointing the correct smem page for reading during `ldsXY()` */ +int pageRd; + +/** block of X data loaded from smem after `ldsXY()` */ +DataT regx[P::AccRowsPerTh][P::Veclen]; +/** block of Y data loaded from smem after `ldsXY()` */ +DataT regy[P::AccColsPerTh][P::Veclen]; +/** block of X data loaded from global mem after `ldgXY()` */ +DataT ldgDataX[P::LdgPerThX][P::Veclen]; +/** block of Y data loaded from global mem after `ldgXY()` */ +DataT ldgDataY[P::LdgPerThY][P::Veclen]; + +static const DataT Zero = (DataT)0; + +public: +/** +* @brief Ctor +* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] +* @param[in] _y Y matrix. 
[on device] [dim = _n x _k] [row-major] +* @param[in] _m number of rows of X +* @param[in] _n number of rows of Y +* @param[in] _k number of cols of X and Y +* @param[in] _smem shared memory region used during computations +*/ +DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, char* _smem) +: m(_m), +n(_n), +k(_k), +lda(_k), +ldb(_k), +xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), +yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), +x(_x + xrowid * lda), +y(_y + yrowid * ldb), +srowid(threadIdx.x / P::LdgThRow), +scolid((threadIdx.x % P::LdgThRow) * P::Veclen), +accrowid(threadIdx.x / P::AccThCols), +acccolid(threadIdx.x % P::AccThCols), +sx((DataT*)_smem), +sy(&(sx[P::SmemPageX])), +pageWr(0), +pageRd(0) {} + +/** +* @brief Ctor +* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] +* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] +* @param[in] _m number of rows of X +* @param[in] _n number of rows of Y +* @param[in] _k number of cols of X and Y +* @param[in] _smem shared memory region used during computations +*/ +DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) +: m(_m), +n(_n), +k(_k), +lda(_lda), +ldb(_ldb), +ldd(_ldd), +srowid(threadIdx.x / P::LdgThRow), +scolid((threadIdx.x % P::LdgThRow) * P::Veclen), +accrowid(threadIdx.x / P::AccThCols), +acccolid(threadIdx.x % P::AccThCols), +sx((DataT*)_smem), +sy(&(sx[P::SmemPageX])), +pageWr(0), +pageRd(0) { +if (isRowMajor) { +xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; +yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; +x = _x + xrowid * lda; +y = _y + yrowid * ldb; +} else { +xrowid = IdxT(blockIdx.y) * P::Mblk; +yrowid = IdxT(blockIdx.x) * P::Nblk; +x = _x + xrowid + srowid * lda; +y = _y + yrowid + srowid * ldb; +} +} + +protected: +/** +* @brief Load current block of X/Y from global memory to registers +* @param[in] kidx current start index of k 
to be loaded +*/ +DI void ldgXY(IdxT kidx) { +ldgX(kidx); +ldgY(kidx); +} + +/** +* @brief Store current block of X/Y from registers to smem +* @param[in] kidx current start index of k to be loaded +*/ +DI void stsXY() { +stsX(sx + pageWr * P::SmemPage); +stsY(sy + pageWr * P::SmemPage); +} + +/** +* @brief Load X and Y block from shared memory to registers +* @param[in] kidx k value from the current k-block to be loaded from smem +*/ +DI void ldsXY(int kidx) { +ldsX(kidx, sx + pageRd * P::SmemPage); +ldsY(kidx, sy + pageRd * P::SmemPage); +} + +private: +DI void ldgX(IdxT kidx) { +if (isRowMajor) { +auto numRows = m; +auto koffset = kidx + scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThX; ++i) { + if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) { + ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset); + } else { +#pragma unroll + for (int j = 0; j < P::Veclen; ++j) { + ldgDataX[i][j] = Zero; + } + } +} +} else { +const auto numRows = k; +auto koffset = scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThX; ++i) { + if ((koffset + xrowid) < lda && + (srowid + kidx + i * P::LdgRowsX) < numRows) { + ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); + } else { +#pragma unroll + for (int j = 0; j < P::Veclen; ++j) { + ldgDataX[i][j] = Zero; + } + } +} +} +} + +DI void ldgY(IdxT kidx) { +if (isRowMajor) { +auto numRows = n; +auto koffset = kidx + scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThY; ++i) { + if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) { + ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset); + } else { +#pragma unroll + for (int j = 0; j < P::Veclen; ++j) { + ldgDataY[i][j] = Zero; + } + } +} +} else { +auto numRows = k; +auto koffset = scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThY; ++i) { + if ((koffset + yrowid) < ldb && + (srowid + kidx + i * P::LdgRowsY) < numRows) { + ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); + } else { +#pragma unroll + for (int j = 0; j 
< P::Veclen; ++j) { + ldgDataY[i][j] = Zero; + } + } +} +} +} + +DI void stsX(DataT* smem) { +auto* saddr = smem + srowid * P::SmemStride + scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThX; ++i) { +sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]); +} +} + +DI void stsY(DataT* smem) { +auto* saddr = smem + srowid * P::SmemStride + scolid; +#pragma unroll +for (int i = 0; i < P::LdgPerThY; ++i) { +sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]); +} +} + +DI void ldsX(int kidx, DataT* smem) { +if (isRowMajor) { +auto* saddr = smem + accrowid * P::SmemStride + kidx; +#pragma unroll +for (int i = 0; i < P::AccRowsPerTh; ++i) { + lds(regx[i], saddr + i * P::AccThRows * P::SmemStride); +} +} else { +auto* saddr = smem + accrowid + kidx * P::SmemStride; +#pragma unroll +for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int v = 0; v < P::Veclen; ++v) { + regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride]; + } +} +} +} + +DI void ldsY(int kidx, DataT* smem) { +if (isRowMajor) { +auto* saddr = smem + acccolid * P::SmemStride + kidx; +#pragma unroll +for (int i = 0; i < P::AccColsPerTh; ++i) { + lds(regy[i], saddr + i * P::AccThCols * P::SmemStride); +} +} else { +auto* saddr = smem + acccolid + kidx * P::SmemStride; +#pragma unroll +for (int i = 0; i < P::AccColsPerTh; ++i) { +#pragma unroll + for (int v = 0; v < P::Veclen; ++v) { + regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride]; + } +} +} +} + +}; // struct Contractions_NT + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp new file mode 100644 index 0000000000..71cea65c3c --- /dev/null +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, in, + n_cols, eig_vals, &lwork)); + + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + CUDA_CHECK(cudaGetLastError()); + + int dev_info = d_dev_info.value(stream); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. 
" + "This usually occurs when some of the features do not vary enough."); +} + +enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; + +#if CUDART_VERSION >= 10010 + +/** + * @defgroup eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param n_eig_vals: number of eigenvectors to be generated + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + * @{ + */ +template +void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, + EigVecMemUsage memUsage, cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + int h_meig; + + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_eig_vectors(0, stream); + + if (memUsage == OVERWRITE_INPUT) { + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + } else if (memUsage == COPY_INPUT) { + d_eig_vectors.resize(n_rows * n_cols, stream); + raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), + math_t(0.0), n_cols - 
n_eig_vals + 1, n_cols, &h_meig, eig_vals, + d_work.data(), lwork, d_dev_info.data(), stream)); + } + + CUDA_CHECK(cudaGetLastError()); + + int dev_info = d_dev_info.value(stream); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. " + "This usually occurs when some of the features do not vary enough."); + + if (memUsage == OVERWRITE_INPUT) { + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, + stream); + } else if (memUsage == COPY_INPUT) { + raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, + n_rows, n_eig_vals, stream); + } +} + +#endif + +template +void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + syevjInfo_t syevj_params = nullptr; + CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); + CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); + CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, + eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar dev_info(stream); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + dev_info.data(), syevj_params, stream)); + + int executed_sweeps; + CUSOLVER_CHECK( + cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + + CUDA_CHECK(cudaGetLastError()); + CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git 
a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh new file mode 100644 index 0000000000..275e5f5917 --- /dev/null +++ b/cpp/include/raft/linalg/detail/functional.cuh @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #pragma once + + #include + + namespace raft { + namespace linalg { + namespace detail { + +template +struct divides_scalar { + +public: + divides_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in / scalar_; + } + +private: + ArgType scalar_; +}; + +template +struct adds_scalar { + +public: + adds_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in + scalar_; + } + +private: + ArgType scalar_; +}; + +template +struct multiplies_scalar { + +public: + multiplies_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in * scalar_; + } + +private: + ArgType scalar_; +}; + +template +struct divides_check_zero { + +public: + __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) { + return (b == static_cast(0)) ? 
0.0 : a / b; + } + +}; + + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index c848ac1f4b..a2d8d67c30 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -16,11 +16,14 @@ #pragma once +#include "detail/functional.cuh" #include "unary_op.cuh" namespace raft { namespace linalg { +using detail::divides_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam math_t data-type upon which the math operation will be performed @@ -36,7 +39,7 @@ template void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) { unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, + out, in, len, divides_scalar(scalar), stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 5b2df3bcb3..296f916469 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -16,14 +16,7 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +#include "detail/eig.hpp" namespace raft { namespace linalg { @@ -45,28 +38,7 @@ template void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, in, - n_cols, eig_vals, &lwork)); - - rmm::device_uvector d_work(lwork, stream); - rmm::device_scalar d_dev_info(stream); - - raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - d_dev_info.data(), 
stream)); - CUDA_CHECK(cudaGetLastError()); - - int dev_info = d_dev_info.value(stream); - ASSERT(dev_info == 0, - "eig.cuh: eigensolver couldn't converge to a solution. " - "This usually occurs when some of the features do not vary enough."); + detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); } enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; @@ -91,51 +63,7 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int lwork; - int h_meig; - - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); - - rmm::device_uvector d_work(lwork, stream); - rmm::device_scalar d_dev_info(stream); - rmm::device_uvector d_eig_vectors(0, stream); - - if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); - } else if (memUsage == COPY_INPUT) { - d_eig_vectors.resize(n_rows * n_cols, stream); - raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), - math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, - d_work.data(), lwork, d_dev_info.data(), stream)); - } - - CUDA_CHECK(cudaGetLastError()); - - int dev_info = d_dev_info.value(stream); - ASSERT(dev_info == 0, - "eig.cuh: eigensolver couldn't converge to a solution. 
" - "This usually occurs when some of the features do not vary enough."); - - if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, - stream); - } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, - n_rows, n_eig_vals, stream); - } + detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); } #endif @@ -158,34 +86,7 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - syevjInfo_t syevj_params = nullptr; - CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); - CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); - - int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, - eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); - - rmm::device_uvector d_work(lwork, stream); - rmm::device_scalar dev_info(stream); - - raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - dev_info.data(), syevj_params, stream)); - - int executed_sweeps; - CUSOLVER_CHECK( - cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); - - CUDA_CHECK(cudaGetLastError()); - CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); + detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 1c6dee562d..b7c9619c4e 100644 --- 
a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -16,12 +16,16 @@ #pragma once +#include "detail/functional.cuh" + #include "binary_op.cuh" #include "unary_op.cuh" namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam InType data-type upon which the math operation will be performed @@ -37,15 +41,17 @@ template void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, + out, in, len, adds_scalar(scalar), stream); } +using detail::multiplies_scalar; + template void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, + out, in, len, multiplies_scalar(scalar), stream); } /** @} */ @@ -65,7 +71,7 @@ template void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + out, in1, in2, len, thrust::plus(), stream); } @@ -73,7 +79,7 @@ template void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, + out, in1, in2, len, thrust::minus(), stream); } @@ -81,7 +87,7 @@ template void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, + out, in1, in2, len, thrust::multiplies(), stream); } @@ -89,21 +95,18 @@ template void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a 
/ b; }, + out, in1, in2, len, thrust::divides(), stream); } +using detail::divides_check_zero; + template void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( out, in1, in2, len, - [] __device__(InType a, InType b) { - if (b == InType(0.0)) - return InType(0.0); - else - return a / b; - }, + divides_check_zero(), stream); } /** @} */ From 788ffa864f501efee64904d833a9c9656e2dae57 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 16 Nov 2021 18:00:54 -0800 Subject: [PATCH 03/17] style fix --- .../raft/linalg/cholesky_r1_update.hpp | 3 +- .../raft/linalg/coalesced_reduction.hpp | 3 +- cpp/include/raft/linalg/detail/add.cuh | 30 +- cpp/include/raft/linalg/detail/binary_op.cuh | 30 +- .../raft/linalg/detail/cholesky_r1_update.hpp | 16 +- .../linalg/detail/coalesced_reduction.cuh | 14 +- .../raft/linalg/detail/contractions.cuh | 432 +++++++++--------- cpp/include/raft/linalg/detail/eig.hpp | 6 +- cpp/include/raft/linalg/detail/functional.cuh | 72 ++- cpp/include/raft/linalg/detail/gemm.hpp | 6 +- cpp/include/raft/linalg/detail/lanczos.hpp | 2 +- cpp/include/raft/linalg/detail/map.cuh | 69 ++- .../raft/linalg/detail/map_then_reduce.cuh | 48 +- .../raft/linalg/detail/matrix_vector_op.cuh | 355 +++++++------- cpp/include/raft/linalg/detail/qr.cuh | 183 ++++---- .../raft/linalg/detail/strided_reduction.cuh | 249 +++++----- cpp/include/raft/linalg/detail/subtract.cuh | 71 ++- cpp/include/raft/linalg/detail/svd.cuh | 316 ++++++------- cpp/include/raft/linalg/detail/unary_op.cuh | 193 ++++---- cpp/include/raft/linalg/divide.hpp | 4 +- cpp/include/raft/linalg/eig.hpp | 8 +- cpp/include/raft/linalg/eltwise.hpp | 31 +- cpp/include/raft/linalg/gemm.hpp | 6 +- cpp/include/raft/linalg/lanczos.hpp | 20 +- cpp/include/raft/linalg/map.hpp | 4 +- cpp/include/raft/linalg/map_then_reduce.hpp | 5 +- cpp/include/raft/linalg/matrix_vector_op.hpp | 6 +- cpp/include/raft/linalg/strided_reduction.hpp | 3 +- 
cpp/include/raft/linalg/svd.hpp | 11 +- cpp/test/linalg/eig_sel.cu | 6 +- 30 files changed, 1099 insertions(+), 1103 deletions(-) diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index e72d3e963f..9f669a5058 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -121,7 +121,8 @@ template void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, void *workspace, int *n_bytes, cublasFillMode_t uplo, cudaStream_t stream, math_t eps = -1) { - detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, + stream, eps); } }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 591d605cb2..ad5279b1ad 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -57,7 +57,8 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N, MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index 9c24514449..be7b8bb299 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -24,26 +24,26 @@ namespace detail { template __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { -IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; -if (i < len) { + const math_t 
*singleScalarDev, + IdxType len) { + IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; -} + } } template void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { -// TODO: block dimension has not been tuned -dim3 block(256); -dim3 grid(raft::ceildiv(len, (IdxType)block.x)); -add_dev_scalar_kernel + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // TODO: block dimension has not been tuned + dim3 block(256); + dim3 grid(raft::ceildiv(len, (IdxType)block.x)); + add_dev_scalar_kernel <<>>(outDev, inDev, singleScalarDev, len); -CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaPeekAtLastError()); } -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh index c0b670caf2..969c9dfa3e 100644 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -14,15 +14,15 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include +#include - namespace raft { - namespace linalg { - namespace detail { +namespace raft { +namespace linalg { +namespace detail { - template __global__ void binaryOpKernel(OutType *out, const InType *in1, const InType *in2, IdxType len, Lambda op) { @@ -73,19 +73,19 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, uint64_t outAddr = uint64_t(out); if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { - binaryOpImpl( + binaryOpImpl( out, in1, in2, len, op, stream); } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { - binaryOpImpl( + binaryOpImpl( out, in1, in2, len, op, stream); } else if (4 / maxSize && bytes % 4 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 4)) { - binaryOpImpl( + addressAligned(in1Addr, in2Addr, outAddr, 4)) { + binaryOpImpl( out, in1, in2, len, op, stream); } else if (2 / maxSize && bytes % 2 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 2)) { - binaryOpImpl( + addressAligned(in1Addr, in2Addr, outAddr, 2)) { + binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( @@ -96,6 +96,6 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, } } - } // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp index beb3056e6d..49bb190836 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -28,8 +28,8 @@ namespace detail { template void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { + void 
*workspace, int *n_bytes, cublasFillMode_t uplo, + cudaStream_t stream, math_t eps = -1) { // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -50,8 +50,8 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + ? raft::alignTo(sizeof(math_t) * (n - 1), align) + : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; @@ -84,7 +84,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // A_new now stores L_12, we calculate s = L_12 * L_12 CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L @@ -114,6 +114,6 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, raft::update_device(L_22, &L_22_host, 1, stream); } -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh index f9c5223bdc..253b7032ed 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -14,14 +14,14 @@ * limitations under the License. */ - #pragma once +#pragma once #include #include - namespace raft { - namespace linalg { - namespace detail { +namespace raft { +namespace linalg { +namespace detail { // Kernel (based on norm.cuh) to perform reductions along the coalesced dimension // of the matrix, i.e. 
reduce along rows for row major or reduce along columns @@ -84,6 +84,6 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N, CUDA_CHECK(cudaPeekAtLastError()); } - } // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index 2e4657ebc3..b04c813cd8 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -14,73 +14,73 @@ * limitations under the License. */ - #pragma once +#pragma once - #include +#include - namespace raft { - namespace linalg { - namespace detail { +namespace raft { +namespace linalg { +namespace detail { - template +template struct Contractions_NT { -protected: -typedef Policy P; + protected: + typedef Policy P; -/** number of rows in X */ -IdxT m; -/** number of rows in Y */ -IdxT n; -/** number of columns in X and Y */ -IdxT k; -/** leading dimension in X */ -IdxT lda; -/** leading dimension in Y */ -IdxT ldb; -/** leading dimension in Output D */ -IdxT ldd; + /** number of rows in X */ + IdxT m; + /** number of rows in Y */ + IdxT n; + /** number of columns in X and Y */ + IdxT k; + /** leading dimension in X */ + IdxT lda; + /** leading dimension in Y */ + IdxT ldb; + /** leading dimension in Output D */ + IdxT ldd; -/** current thread's global mem row id for X data */ -IdxT xrowid; -/** current thread's global mem row id for Y data */ -IdxT yrowid; -/** global memory pointer to X matrix */ -const DataT* x; -/** global memory pointer to Y matrix */ -const DataT* y; + /** current thread's global mem row id for X data */ + IdxT xrowid; + /** current thread's global mem row id for Y data */ + IdxT yrowid; + /** global memory pointer to X matrix */ + const DataT* x; + /** global memory pointer to Y matrix */ + const DataT* y; 
-/** current thread's smem row id */ -int srowid; -/** current thread's smem column id */ -int scolid; -/** current thread's accumulation row id */ -int accrowid; -/** current thread's accumulation column id */ -int acccolid; + /** current thread's smem row id */ + int srowid; + /** current thread's smem column id */ + int scolid; + /** current thread's accumulation row id */ + int accrowid; + /** current thread's accumulation column id */ + int acccolid; -/** base smem pointer for X data storage */ -DataT* sx; -/** base smem pointer for Y data storage */ -DataT* sy; -/** index pointing the correct smem page for writing after `ldgXY()` */ -int pageWr; -/** index pointing the correct smem page for reading during `ldsXY()` */ -int pageRd; + /** base smem pointer for X data storage */ + DataT* sx; + /** base smem pointer for Y data storage */ + DataT* sy; + /** index pointing the correct smem page for writing after `ldgXY()` */ + int pageWr; + /** index pointing the correct smem page for reading during `ldsXY()` */ + int pageRd; -/** block of X data loaded from smem after `ldsXY()` */ -DataT regx[P::AccRowsPerTh][P::Veclen]; -/** block of Y data loaded from smem after `ldsXY()` */ -DataT regy[P::AccColsPerTh][P::Veclen]; -/** block of X data loaded from global mem after `ldgXY()` */ -DataT ldgDataX[P::LdgPerThX][P::Veclen]; -/** block of Y data loaded from global mem after `ldgXY()` */ -DataT ldgDataY[P::LdgPerThY][P::Veclen]; + /** block of X data loaded from smem after `ldsXY()` */ + DataT regx[P::AccRowsPerTh][P::Veclen]; + /** block of Y data loaded from smem after `ldsXY()` */ + DataT regy[P::AccColsPerTh][P::Veclen]; + /** block of X data loaded from global mem after `ldgXY()` */ + DataT ldgDataX[P::LdgPerThX][P::Veclen]; + /** block of Y data loaded from global mem after `ldgXY()` */ + DataT ldgDataY[P::LdgPerThY][P::Veclen]; -static const DataT Zero = (DataT)0; + static const DataT Zero = (DataT)0; -public: -/** + public: + /** * @brief Ctor * @param[in] _x X 
matrix. [on device] [dim = _m x _k] [row-major] * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] @@ -89,27 +89,27 @@ public: * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ -DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) -: m(_m), -n(_n), -k(_k), -lda(_k), -ldb(_k), -xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), -yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), -x(_x + xrowid * lda), -y(_y + yrowid * ldb), -srowid(threadIdx.x / P::LdgThRow), -scolid((threadIdx.x % P::LdgThRow) * P::Veclen), -accrowid(threadIdx.x / P::AccThCols), -acccolid(threadIdx.x % P::AccThCols), -sx((DataT*)_smem), -sy(&(sx[P::SmemPageX])), -pageWr(0), -pageRd(0) {} + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, char* _smem) + : m(_m), + n(_n), + k(_k), + lda(_k), + ldb(_k), + xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), + yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), + x(_x + xrowid * lda), + y(_y + yrowid * ldb), + srowid(threadIdx.x / P::LdgThRow), + scolid((threadIdx.x % P::LdgThRow) * P::Veclen), + accrowid(threadIdx.x / P::AccThCols), + acccolid(threadIdx.x % P::AccThCols), + sx((DataT*)_smem), + sy(&(sx[P::SmemPageX])), + pageWr(0), + pageRd(0) {} -/** + /** * @brief Ctor * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] * @param[in] _y Y matrix. 
[on device] [dim = _n x _k] [row-major] @@ -118,186 +118,186 @@ pageRd(0) {} * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ -DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) -: m(_m), -n(_n), -k(_k), -lda(_lda), -ldb(_ldb), -ldd(_ldd), -srowid(threadIdx.x / P::LdgThRow), -scolid((threadIdx.x % P::LdgThRow) * P::Veclen), -accrowid(threadIdx.x / P::AccThCols), -acccolid(threadIdx.x % P::AccThCols), -sx((DataT*)_smem), -sy(&(sx[P::SmemPageX])), -pageWr(0), -pageRd(0) { -if (isRowMajor) { -xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; -yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; -x = _x + xrowid * lda; -y = _y + yrowid * ldb; -} else { -xrowid = IdxT(blockIdx.y) * P::Mblk; -yrowid = IdxT(blockIdx.x) * P::Nblk; -x = _x + xrowid + srowid * lda; -y = _y + yrowid + srowid * ldb; -} -} + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + : m(_m), + n(_n), + k(_k), + lda(_lda), + ldb(_ldb), + ldd(_ldd), + srowid(threadIdx.x / P::LdgThRow), + scolid((threadIdx.x % P::LdgThRow) * P::Veclen), + accrowid(threadIdx.x / P::AccThCols), + acccolid(threadIdx.x % P::AccThCols), + sx((DataT*)_smem), + sy(&(sx[P::SmemPageX])), + pageWr(0), + pageRd(0) { + if (isRowMajor) { + xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; + yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; + } else { + xrowid = IdxT(blockIdx.y) * P::Mblk; + yrowid = IdxT(blockIdx.x) * P::Nblk; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; + } + } -protected: -/** + protected: + /** * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ -DI void ldgXY(IdxT kidx) { -ldgX(kidx); -ldgY(kidx); -} + DI void ldgXY(IdxT kidx) { + ldgX(kidx); + ldgY(kidx); + } -/** + /** * 
@brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ -DI void stsXY() { -stsX(sx + pageWr * P::SmemPage); -stsY(sy + pageWr * P::SmemPage); -} + DI void stsXY() { + stsX(sx + pageWr * P::SmemPage); + stsY(sy + pageWr * P::SmemPage); + } -/** + /** * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ -DI void ldsXY(int kidx) { -ldsX(kidx, sx + pageRd * P::SmemPage); -ldsY(kidx, sy + pageRd * P::SmemPage); -} + DI void ldsXY(int kidx) { + ldsX(kidx, sx + pageRd * P::SmemPage); + ldsY(kidx, sy + pageRd * P::SmemPage); + } -private: -DI void ldgX(IdxT kidx) { -if (isRowMajor) { -auto numRows = m; -auto koffset = kidx + scolid; + private: + DI void ldgX(IdxT kidx) { + if (isRowMajor) { + auto numRows = m; + auto koffset = kidx + scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThX; ++i) { - if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) { - ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset); - } else { + for (int i = 0; i < P::LdgPerThX; ++i) { + if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) { + ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset); + } else { #pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataX[i][j] = Zero; - } - } -} -} else { -const auto numRows = k; -auto koffset = scolid; + for (int j = 0; j < P::Veclen; ++j) { + ldgDataX[i][j] = Zero; + } + } + } + } else { + const auto numRows = k; + auto koffset = scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { - ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); - } else { + for (int i = 0; i < P::LdgPerThX; ++i) { + if ((koffset + xrowid) < lda && + (srowid + kidx + i * P::LdgRowsX) < numRows) { + ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); + } else { #pragma unroll - for (int j = 0; j < 
P::Veclen; ++j) { - ldgDataX[i][j] = Zero; + for (int j = 0; j < P::Veclen; ++j) { + ldgDataX[i][j] = Zero; + } + } + } } } -} -} -} -DI void ldgY(IdxT kidx) { -if (isRowMajor) { -auto numRows = n; -auto koffset = kidx + scolid; + DI void ldgY(IdxT kidx) { + if (isRowMajor) { + auto numRows = n; + auto koffset = kidx + scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThY; ++i) { - if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) { - ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset); - } else { + for (int i = 0; i < P::LdgPerThY; ++i) { + if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) { + ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset); + } else { #pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; - } - } -} -} else { -auto numRows = k; -auto koffset = scolid; + for (int j = 0; j < P::Veclen; ++j) { + ldgDataY[i][j] = Zero; + } + } + } + } else { + auto numRows = k; + auto koffset = scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { - ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); - } else { + for (int i = 0; i < P::LdgPerThY; ++i) { + if ((koffset + yrowid) < ldb && + (srowid + kidx + i * P::LdgRowsY) < numRows) { + ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); + } else { #pragma unroll - for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; + for (int j = 0; j < P::Veclen; ++j) { + ldgDataY[i][j] = Zero; + } + } + } } } -} -} -} -DI void stsX(DataT* smem) { -auto* saddr = smem + srowid * P::SmemStride + scolid; + DI void stsX(DataT* smem) { + auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThX; ++i) { -sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]); -} -} + for (int i = 0; i < P::LdgPerThX; ++i) { + sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]); + } + } -DI void stsY(DataT* smem) { -auto* 
saddr = smem + srowid * P::SmemStride + scolid; + DI void stsY(DataT* smem) { + auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll -for (int i = 0; i < P::LdgPerThY; ++i) { -sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]); -} -} + for (int i = 0; i < P::LdgPerThY; ++i) { + sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]); + } + } -DI void ldsX(int kidx, DataT* smem) { -if (isRowMajor) { -auto* saddr = smem + accrowid * P::SmemStride + kidx; + DI void ldsX(int kidx, DataT* smem) { + if (isRowMajor) { + auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll -for (int i = 0; i < P::AccRowsPerTh; ++i) { - lds(regx[i], saddr + i * P::AccThRows * P::SmemStride); -} -} else { -auto* saddr = smem + accrowid + kidx * P::SmemStride; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + lds(regx[i], saddr + i * P::AccThRows * P::SmemStride); + } + } else { + auto* saddr = smem + accrowid + kidx * P::SmemStride; #pragma unroll -for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride]; + for (int v = 0; v < P::Veclen; ++v) { + regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride]; + } + } + } } -} -} -} -DI void ldsY(int kidx, DataT* smem) { -if (isRowMajor) { -auto* saddr = smem + acccolid * P::SmemStride + kidx; + DI void ldsY(int kidx, DataT* smem) { + if (isRowMajor) { + auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll -for (int i = 0; i < P::AccColsPerTh; ++i) { - lds(regy[i], saddr + i * P::AccThCols * P::SmemStride); -} -} else { -auto* saddr = smem + acccolid + kidx * P::SmemStride; + for (int i = 0; i < P::AccColsPerTh; ++i) { + lds(regy[i], saddr + i * P::AccThCols * P::SmemStride); + } + } else { + auto* saddr = smem + acccolid + kidx * P::SmemStride; #pragma unroll -for (int i = 0; i < P::AccColsPerTh; ++i) { + for (int i = 0; i < P::AccColsPerTh; ++i) { 
#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride]; + for (int v = 0; v < P::Veclen; ++v) { + regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride]; + } + } + } } -} -} -} }; // struct Contractions_NT -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index 9e0966e67b..c37f3c92a5 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -202,6 +202,6 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); } -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh index 275e5f5917..fec2e27228 100644 --- a/cpp/include/raft/linalg/detail/functional.cuh +++ b/cpp/include/raft/linalg/detail/functional.cuh @@ -14,67 +14,61 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include +#include - namespace raft { - namespace linalg { - namespace detail { +namespace raft { +namespace linalg { +namespace detail { template struct divides_scalar { + public: + divides_scalar(ArgType scalar) : scalar_(scalar) {} -public: - divides_scalar(ArgType scalar) : scalar_(scalar) {} + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in / scalar_; + } - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in / scalar_; - } - -private: - ArgType scalar_; + private: + ArgType scalar_; }; template struct adds_scalar { + public: + adds_scalar(ArgType scalar) : scalar_(scalar) {} -public: - adds_scalar(ArgType scalar) : scalar_(scalar) {} - - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in + scalar_; - } + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in + scalar_; + } -private: - ArgType scalar_; + private: + ArgType scalar_; }; template struct multiplies_scalar { + public: + multiplies_scalar(ArgType scalar) : scalar_(scalar) {} -public: - multiplies_scalar(ArgType scalar) : scalar_(scalar) {} - - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in * scalar_; - } + __host__ __device__ inline ReturnType operator()(ArgType in) { + return in * scalar_; + } -private: - ArgType scalar_; + private: + ArgType scalar_; }; template struct divides_check_zero { - -public: - __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) { - return (b == static_cast(0)) ? 0.0 : a / b; - } - + public: + __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) { + return (b == static_cast(0)) ? 
0.0 : a / b; + } }; - -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 1a09e86532..c565d69f86 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -119,6 +119,6 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, b, ldb, &beta, c, ldc, stream)); } -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index e1672ef23c..dc75b70509 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -596,7 +596,7 @@ static int lanczosRestart( return 0; } -} // namespace detail +} // namespace detail } // namespace spectral namespace detail { diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index faa899492e..0e649fb937 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -14,39 +14,38 @@ * limitations under the License. */ - #pragma once +#pragma once - #include - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - __global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } - } - - template - void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... 
args) { - const int nblks = raft::ceildiv(len, (size_t)TPB); - mapKernel - <<>>(out, len, map, in, args...); - CUDA_CHECK(cudaPeekAtLastError()); - } - - } // namespace detail - } // namespace linalg - }; // namespace raft - \ No newline at end of file +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, + Args... args) { + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (idx < len) { + out[idx] = map(in[idx], args[idx]...); + } +} + +template +void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, + const InType *in, Args... args) { + const int nblks = raft::ceildiv(len, (size_t)TPB); + mapKernel + <<>>(out, len, map, in, args...); + CUDA_CHECK(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 98a08713dc..a7031bc48f 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -29,51 +29,51 @@ struct sum_tag {}; template __device__ void reduce(OutType *out, const InType acc, sum_tag) { -typedef cub::BlockReduce BlockReduce; -__shared__ typename BlockReduce::TempStorage temp_storage; -OutType tmp = BlockReduce(temp_storage).Sum(acc); -if (threadIdx.x == 0) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType tmp = BlockReduce(temp_storage).Sum(acc); + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); -} + } } template __device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { -typedef cub::BlockReduce BlockReduce; -__shared__ typename BlockReduce::TempStorage temp_storage; -OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); -if (threadIdx.x == 0) { + typedef cub::BlockReduce BlockReduce; 
+ __shared__ typename BlockReduce::TempStorage temp_storage; + OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); -} + } } template + typename ReduceLambda, int TPB, typename... Args> __global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, MapOp map, ReduceLambda op, const InType *in, Args... args) { -OutType acc = neutral; -auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + OutType acc = neutral; + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); -if (idx < len) { + if (idx < len) { acc = map(in[idx], args[idx]...); -} + } -__syncthreads(); + __syncthreads(); -reduce(out, acc, op); + reduce(out, acc, op); } template + typename ReduceLambda, int TPB, typename... Args> void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { -raft::update_device(out, &neutral, 1, stream); -const int nblks = raft::ceildiv(len, (size_t)TPB); -mapThenReduceKernel + ReduceLambda op, cudaStream_t stream, const InType *in, + Args... args) { + raft::update_device(out, &neutral, 1, stream); + const int nblks = raft::ceildiv(len, (size_t)TPB); + mapThenReduceKernel <<>>(out, len, neutral, map, op, in, args...); -CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaPeekAtLastError()); } }; // end namespace detail diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index d46a7833e1..17f748248b 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -14,182 +14,181 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Lambda op) { - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = threadIdx.x; - idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; - idx *= VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec.load(vector, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec.load(vector, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec.fill(vector[vIdx]); - } else { - vIdx = idx / N; - vec.fill(vector[vIdx]); - } - mat.load(matrix, idx); - #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); - mat.store(out, idx); - } - - template - void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType len = N * D; - IdxType nblks = - raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, - bcastAlongRows, op); - CUDA_CHECK(cudaPeekAtLastError()); - } - - template - void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { - IdxType stride = rowMajor ? 
D : N; - size_t stride_bytes = stride * sizeof(Type); - - auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && - reinterpret_cast(matrix) % sizeof(Type); - }; - - if (test_aligned_access(16)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(8)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(4)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(2)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (1 / sizeof(Type)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } - } - - ///@todo: come up with a cleaner interface to support these cases in future! - - template - __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector1, const Type *vector2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op) { - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec1, vec2; - ///@todo: yikes! use fast-int-div here. 
- ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } else { - vIdx = idx / N; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } - mat.load(matrix, idx); - #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); - mat.store(out, idx); - } - - template - void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op); - CUDA_CHECK(cudaPeekAtLastError()); - } - - template - void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? 
D : N; - size_t stride_bytes = stride * sizeof(Type); - - auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && - reinterpret_cast(matrix) % sizeof(Type); - }; - - if (test_aligned_access(16)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(8)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(4)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(2)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (1 / sizeof(Type)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } - } - - }; // end namespace detail - }; // end namespace linalg - }; // end namespace raft - \ No newline at end of file +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, + Lambda op) { + typedef TxN_t VecType; + IdxType len = N * D; + IdxType idx = threadIdx.x; + idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; + idx *= VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec; + ///@todo: yikes! use fast-int-div here. 
+ ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec.load(vector, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec.load(vector, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec.fill(vector[vIdx]); + } else { + vIdx = idx / N; + vec.fill(vector[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); + mat.store(out, idx); +} + +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType len = N * D; + IdxType nblks = + raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); + matrixVectorOpKernel + <<>>(out, matrix, vec, D, N, rowMajor, + bcastAlongRows, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, + cudaStream_t stream) { + IdxType stride = rowMajor ? 
D : N; + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast(matrix) % sizeof(Type); + }; + + if (test_aligned_access(16)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(8)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(4)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(2)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +///@todo: come up with a cleaner interface to support these cases in future! + +template +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector1, const Type *vector2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op) { + typedef TxN_t VecType; + IdxType len = N * D; + IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec1, vec2; + ///@todo: yikes! use fast-int-div here. 
+ ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } else { + vIdx = idx / N; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); + mat.store(out, idx); +} + +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); + matrixVectorOpKernel + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, + bcastAlongRows, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType stride = rowMajor ? 
D : N; + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast(matrix) % sizeof(Type); + }; + + if (test_aligned_access(16)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(8)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(4)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (test_aligned_access(2)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 1f1e36f426..c6bb99f8f2 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -14,96 +14,95 @@ * limitations under the License. 
*/ - #pragma once - - #include - #include - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, + int n_rows, int n_cols, cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + int k = min(m, n); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + rmm::device_uvector tau(k, stream); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); + + rmm::device_scalar devInfo(stream); + int Lwork; + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); + rmm::device_uvector workspace(Lwork, stream); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); + /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + CUSOLVER_CHECK( + cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); +} + +template +void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, int n_rows, int n_cols, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int m = n_rows, n = n_cols; - int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); - - rmm::device_uvector tau(k, stream); - CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - - rmm::device_scalar devInfo(stream); - int Lwork; - - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); - rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); - /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
- #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - CUDA_CHECK(cudaDeviceSynchronize()); - #endif - CUSOLVER_CHECK( - cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); - workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); - } - - template - void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - int m = n_rows, n = n_cols; - rmm::device_uvector R_full(m * n, stream); - rmm::device_uvector tau(min(m, n), stream); - CUDA_CHECK( - cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); - int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); - - int Lwork; - rmm::device_scalar devInfo(stream); - - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, - R_full_ncols, R_full.data(), - R_full_nrows, &Lwork)); - rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, - tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); - // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
- #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - CUDA_CHECK(cudaDeviceSynchronize()); - #endif - - raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - - CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); - int Q_nrows = m, Q_ncols = n; - - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, - min(Q_ncols, Q_nrows), Q, Q_nrows, - tau.data(), &Lwork)); - workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), - workspace.data(), Lwork, devInfo.data(), stream)); - } - - }; // namespace detail - }; // namespace linalg - }; // namespace raft - \ No newline at end of file + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + rmm::device_uvector R_full(m * n, stream); + rmm::device_uvector tau(min(m, n), stream); + CUDA_CHECK( + cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + int R_full_nrows = m, R_full_ncols = n; + CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + int Lwork; + rmm::device_scalar devInfo(stream); + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, + R_full_ncols, R_full.data(), + R_full_nrows, &Lwork)); + rmm::device_uvector workspace(Lwork, stream); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, + tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); + + CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + int Q_nrows = m, Q_ncols = n; + + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, + min(Q_ncols, Q_nrows), Q, Q_nrows, + tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), + workspace.data(), Lwork, devInfo.data(), stream)); +} + +}; // namespace detail +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 8de0e6ed5c..2819bde8d2 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -14,130 +14,129 @@ * limitations under the License. */ - #pragma once +#pragma once - #include - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { +#include +#include +#include +#include - // Kernel to perform reductions along the strided dimension - // of the matrix, i.e. 
reduce along columns for row major or reduce along rows - // for column major layout - template - __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, - int N, Type init, MainLambda main_op) { - // Thread reduction - Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; - if (colStart < D) { - int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; - for (int j = rowStart; j < N; j += stride) { - int idx = colStart + j * D; - thread_data += main_op(data[idx], j); - } - } - - // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - Type *temp = (Type *)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; - temp[myidx] = thread_data; - __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; - __syncthreads(); - } - - // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicAdd(dots + colStart, temp[myidx]); - } - - // Kernel to perform reductions along the strided dimension - // of the matrix, i.e. 
reduce along columns for row major or reduce along rows - // for column major layout - template - __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, - int N, OutType init, MainLambda main_op, - ReduceLambda reduce_op) { - // Thread reduction - OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; - if (colStart < D) { - IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; - for (IdxType j = rowStart; j < N; j += stride) { - IdxType idx = colStart + j * D; - thread_data = reduce_op(thread_data, main_op(data[idx], j)); - } - } - - // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto *temp = (OutType *)tmp; // Cast to desired type - IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; - __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) - temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); - __syncthreads(); - } - - // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); - } - - template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> - void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { - ///@todo: this extra should go away once we have eliminated the need - /// for atomics in stridedKernel (redesign for this is already underway) - if (!inplace) - raft::linalg::unaryOp( - dots, dots, D, [init] __device__(OutType a) { return init; }, stream); - - // Arbitrary numbers for now, probably need to tune - const dim3 thrds(32, 16); - IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread 
> 8) ? 8 : elemsPerThread; - const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), - raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); - const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; - - ///@todo: this complication should go away once we have eliminated the need - /// for atomics in stridedKernel (redesign for this is already underway) - if constexpr (std::is_same>::value && - std::is_same::value) - stridedSummationKernel - <<>>(dots, data, D, N, init, main_op); - else - stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, - reduce_op); - - ///@todo: this complication should go away once we have eliminated the need - /// for atomics in stridedKernel (redesign for this is already underway) - // Perform final op on output data - if (!std::is_same>::value) - raft::linalg::unaryOp(dots, dots, D, final_op, stream); - } - - }; // end namespace detail - }; // end namespace linalg - }; // end namespace raft - \ No newline at end of file +namespace raft { +namespace linalg { +namespace detail { + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. 
reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, + int N, Type init, MainLambda main_op) { + // Thread reduction + Type thread_data = Type(init); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + int rowStart = blockIdx.y * blockDim.y + threadIdx.y; + int stride = blockDim.y * gridDim.y; + for (int j = rowStart; j < N; j += stride) { + int idx = colStart + j * D; + thread_data += main_op(data[idx], j); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + Type *temp = (Type *)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicAdd(dots + colStart, temp[myidx]); +} + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. 
reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, + int N, OutType init, MainLambda main_op, + ReduceLambda reduce_op) { + // Thread reduction + OutType thread_data = init; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; + IdxType stride = blockDim.y * gridDim.y; + for (IdxType j = rowStart; j < N; j += stride) { + IdxType idx = colStart + j * D; + thread_data = reduce_op(thread_data, main_op(data[idx], j)); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + auto *temp = (OutType *)tmp; // Cast to desired type + IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) + temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + ///@todo: this extra should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if (!inplace) + raft::linalg::unaryOp( + dots, dots, D, [init] __device__(OutType a) { return init; }, stream); + + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 
8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if constexpr (std::is_same>::value && + std::is_same::value) + stridedSummationKernel + <<>>(dots, data, D, N, init, main_op); + else + stridedReductionKernel + <<>>(dots, data, D, N, init, main_op, + reduce_op); + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + // Perform final op on output data + if (!std::is_same>::value) + raft::linalg::unaryOp(dots, dots, D, final_op, stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index e6faa883de..a58888a24f 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -14,40 +14,39 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation - int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } - } - - template - void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { - // Just for the note - there is no way to express such operation with cuBLAS in effective way - // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda - const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); - subtract_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); - CUDA_CHECK(cudaPeekAtLastError()); - } - - }; // end namespace detail - }; // end namespace linalg - }; // end namespace raft - \ No newline at end of file +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + //TODO: kernel do not use shared memory in current implementation + int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { + outDev[i] = inDev[i] - *singleScalarDev; + } +} + +template +void subtractDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // Just for the note - there is no way to express such operation with cuBLAS in effective way + // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda + const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); + subtract_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, 
len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 5e2ace1ad2..60ac47c501 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -14,84 +14,84 @@ * limitations under the License. */ - #pragma once - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); - - #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 - // 46340: sqrt of max int value - ASSERT(n_rows <= 46340, - "svd solver is not supported for the data that has more than 46340 " - "samples (rows) " - "if you are using CUDA version <11. 
Please use other solvers such as " - "eig if it is available."); - #endif - - const int m = n_rows; - const int n = n_cols; - - rmm::device_scalar devInfo(stream); - T *d_rwork = nullptr; - - int lwork = 0; - CUSOLVER_CHECK( - cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); - rmm::device_uvector d_work(lwork, stream); - - char jobu = 'S'; - char jobvt = 'A'; - - if (!gen_left_vec) { - char new_u = 'N'; - strcpy(&jobu, &new_u); - } - - if (!gen_right_vec) { - char new_vt = 'N'; - strcpy(&jobvt, &new_vt); - } - - CUSOLVER_CHECK(cusolverDngesvd( - cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, - right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); - - // Transpose the right singular vector back - if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); - - CUDA_CHECK(cudaGetLastError()); - - int dev_info; - raft::update_host(&dev_info, devInfo.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT(dev_info == 0, - "svd.cuh: svd couldn't converge to a solution. " - "This usually occurs when some of the features do not vary enough."); - } - - template +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, + T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, + bool trans_right, bool gen_left_vec, bool gen_right_vec, + cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + +#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 + // 46340: sqrt of max int value + ASSERT(n_rows <= 46340, + "svd solver is not supported for the data that has more than 46340 " + "samples (rows) " + "if you are using CUDA version <11. 
Please use other solvers such as " + "eig if it is available."); +#endif + + const int m = n_rows; + const int n = n_cols; + + rmm::device_scalar devInfo(stream); + T *d_rwork = nullptr; + + int lwork = 0; + CUSOLVER_CHECK( + cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + rmm::device_uvector d_work(lwork, stream); + + char jobu = 'S'; + char jobvt = 'A'; + + if (!gen_left_vec) { + char new_u = 'N'; + strcpy(&jobu, &new_u); + } + + if (!gen_right_vec) { + char new_vt = 'N'; + strcpy(&jobvt, &new_vt); + } + + CUSOLVER_CHECK(cusolverDngesvd( + cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, + right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + + // Transpose the right singular vector back + if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); + + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, devInfo.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "svd.cuh: svd couldn't converge to a solution. 
" + "This usually occurs when some of the features do not vary enough."); +} + +template void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, T *U, T *V, bool gen_left_vec, cudaStream_t stream) { cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); @@ -106,7 +106,8 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); + raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, + stream); raft::matrix::colReverse(V, n_cols, n_cols, stream); raft::matrix::rowReverse(S, n_cols, 1, stream); @@ -121,82 +122,81 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, } } - template - void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - gesvdjInfo_t gesvdj_params = NULL; - - CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params)); - CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); - CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); - - int m = n_rows; - int n = n_cols; - - rmm::device_scalar devInfo(stream); - - int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); - - rmm::device_uvector d_work(lwork, stream); - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), - gesvdj_params, stream)); - - 
CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); - } - - template - bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { - cublasHandle_t cublasH = handle.get_cublas_handle(); - - int m = n_rows, n = n_cols; - - // form product matrix - rmm::device_uvector P_d(m * n, stream); - rmm::device_uvector S_mat(k * k, stream); - CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); - CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); - - raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream); - svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream); - - // get norms of each - math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream); - math_t normU = raft::matrix::getL2Norm(handle, U, m * k, stream); - math_t normS = raft::matrix::getL2Norm(handle, S_mat.data(), k * k, stream); - math_t normV = raft::matrix::getL2Norm(handle, V, n * k, stream); - math_t normP = raft::matrix::getL2Norm(handle, P_d.data(), m * n, stream); - - // calculate percent error - const math_t alpha = 1.0, beta = -1.0; - rmm::device_uvector A_minus_P(m * n, stream); - CUDA_CHECK( - cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, - &alpha, A_d, m, &beta, P_d.data(), m, - A_minus_P.data(), m, stream)); - - math_t norm_A_minus_P = - raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; - return (percent_error / 100.0 < tol); - } - - }; // end namespace detail - }; // end namespace linalg - }; // end namespace raft - \ No newline at end of file +template +void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + math_t *sing_vals, math_t *left_sing_vecs, + math_t *right_sing_vecs, 
bool gen_left_vec, bool gen_right_vec, + math_t tol, int max_sweeps, cudaStream_t stream) { + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + gesvdjInfo_t gesvdj_params = NULL; + + CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); + + int m = n_rows; + int n = n_cols; + + rmm::device_scalar devInfo(stream); + + int lwork = 0; + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + + rmm::device_uvector d_work(lwork, stream); + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), + gesvdj_params, stream)); + + CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template +bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, + math_t *S_vec, math_t *V, int n_rows, int n_cols, + int k, math_t tol, cudaStream_t stream) { + cublasHandle_t cublasH = handle.get_cublas_handle(); + + int m = n_rows, n = n_cols; + + // form product matrix + rmm::device_uvector P_d(m * n, stream); + rmm::device_uvector S_mat(k * k, stream); + CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); + CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + + raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream); + svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream); + + // get norms of each + math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream); + math_t normU = raft::matrix::getL2Norm(handle, U, m * k, stream); + math_t normS = raft::matrix::getL2Norm(handle, S_mat.data(), k * k, 
stream); + math_t normV = raft::matrix::getL2Norm(handle, V, n * k, stream); + math_t normP = raft::matrix::getL2Norm(handle, P_d.data(), m * n, stream); + + // calculate percent error + const math_t alpha = 1.0, beta = -1.0; + rmm::device_uvector A_minus_P(m * n, stream); + CUDA_CHECK( + cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, + &alpha, A_d, m, &beta, P_d.data(), m, + A_minus_P.data(), m, stream)); + + math_t norm_A_minus_P = + raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; + return (percent_error / 100.0 < tol); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh index 8502dc26f4..190205fea0 100644 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -14,101 +14,100 @@ * limitations under the License. */ - #pragma once +#pragma once - #include - #include - #include - - namespace raft { - namespace linalg { - namespace detail { - - template - __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, - Lambda op) { - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a; - OutVecType b; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in, idx); - #pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - b.val.data[i] = op(a.val.data[i]); - } - b.store(out, idx); - } - - template - void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - unaryOpKernel - <<>>(out, in, len, op); - CUDA_CHECK(cudaPeekAtLastError()); - } - - template - void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (1 / maxSize) { - unaryOpImpl( - out, in, len, op, stream); - } else { - unaryOpImpl(out, in, len, op, - stream); - } - } - - template - __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { - op(out + idx, idx); - } - } - - template - void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; // silently skip in case of 0 length input - auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel - <<>>(out, len, op); - CUDA_CHECK(cudaGetLastError()); - } - - }; // end namespace detail - }; // end namespace linalg - }; // end namespace raft - \ No newline at end of file +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, + Lambda op) { + typedef TxN_t InVecType; 
+ typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; //silently skip in case of 0 length input + constexpr auto maxSize = + sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && + outAddr % 16 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && + outAddr % 8 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && + outAddr % 4 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && + outAddr % 2 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl( + out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, + stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { + op(out + idx, idx); + } +} + +template +void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; // silently skip in case of 0 
length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel + <<>>(out, len, op); + CUDA_CHECK(cudaGetLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index 56d01be990..e4eead777c 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -38,9 +38,7 @@ using detail::divides_scalar; template void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, divides_scalar(scalar), - stream); + unaryOp(out, in, len, divides_scalar(scalar), stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 2659f3d6b8..288b43f27f 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -41,9 +41,9 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); } +using detail::COPY_INPUT; using detail::EigVecMemUsage; using detail::OVERWRITE_INPUT; -using detail::COPY_INPUT; #if CUDART_VERSION >= 10010 @@ -65,7 +65,8 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); + detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, + eig_vals, memUsage, stream); } #endif @@ -88,7 +89,8 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); + detail::eigJacobi(handle, in, n_rows, n_cols, 
eig_vectors, eig_vals, stream, + tol, sweeps); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 90bf608e11..63b824e6f7 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -40,9 +40,8 @@ using detail::adds_scalar; template void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, adds_scalar(scalar), - stream); + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), + stream); } using detail::multiplies_scalar; @@ -50,9 +49,8 @@ using detail::multiplies_scalar; template void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, multiplies_scalar(scalar), - stream); + raft::linalg::unaryOp(out, in, len, + multiplies_scalar(scalar), stream); } /** @} */ @@ -70,33 +68,25 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, template void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, thrust::plus(), - stream); + binaryOp(out, in1, in2, len, thrust::plus(), stream); } template void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, thrust::minus(), - stream); + binaryOp(out, in1, in2, len, thrust::minus(), stream); } template void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, thrust::multiplies(), - stream); + binaryOp(out, in1, in2, len, thrust::multiplies(), stream); } template void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, thrust::divides(), - stream); + binaryOp(out, in1, in2, len, thrust::divides(), stream); } 
using detail::divides_check_zero; @@ -104,10 +94,7 @@ using detail::divides_check_zero; template void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, - divides_check_zero(), - stream); + binaryOp(out, in1, in2, len, divides_check_zero(), stream); } /** @} */ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 3e8ac5b768..9326714a41 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -44,7 +44,8 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, math_t beta, cudaStream_t stream) { - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); + detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, + trans_b, alpha, beta, stream); } template @@ -81,7 +82,8 @@ template void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { - detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); + detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, + isYColMajor, stream, alpha, beta); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 2fa8d1dd0d..28c4ff8238 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -83,8 +83,10 @@ int computeSmallestEigenvectors( value_type_t *__restrict__ lanczosVecs_dev, value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { - return 
raft::detail::computeSmallestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, totalIter, shift, - alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev, seed); + return raft::detail::computeSmallestEigenvectors( + handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, + totalIter, shift, alpha_host, beta_host, lanczosVecs_dev, work_dev, + eigVals_dev, eigVecs_dev, seed); } /** @@ -131,7 +133,9 @@ int computeSmallestEigenvectors( value_type_t tol, bool reorthogonalize, index_type_t &iter, value_type_t *__restrict__ eigVals_dev, value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { - return raft::detail::computeSmallestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, eigVals_dev, eigVecs_dev, seed); + return raft::detail::computeSmallestEigenvectors( + handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, + eigVals_dev, eigVecs_dev, seed); } // ========================================================= @@ -192,8 +196,10 @@ int computeLargestEigenvectors( value_type_t *__restrict__ lanczosVecs_dev, value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { - return raft::detail::computeLargestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, totalIter, alpha_host, beta_host, - lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev, seed); + return raft::detail::computeLargestEigenvectors( + handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, + totalIter, alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev, + eigVecs_dev, seed); } /** @@ -240,7 +246,9 @@ int computeLargestEigenvectors( value_type_t tol, bool reorthogonalize, index_type_t &iter, value_type_t *__restrict__ eigVals_dev, value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { - return 
raft::detail::computeLargestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, eigVals_dev, eigVecs_dev, seed); + return raft::detail::computeLargestEigenvectors( + handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, + eigVals_dev, eigVecs_dev, seed); } } // namespace raft diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index 0c9a2d0b10..40e6253af9 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -40,8 +40,8 @@ template void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, const InType *in, Args... args) { - detail::mapImpl(out, len, map, stream, in, - args...); + detail::mapImpl(out, len, map, stream, + in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 6cd58a43dc..75baf86e1c 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -39,8 +39,9 @@ template void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, const InType *in, Args... 
args) { - detail::mapThenReduceImpl( - out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); + detail::mapThenReduceImpl(out, len, (OutType)0, map, + detail::sum_tag(), stream, in, args...); } /** diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index 7ef02735ae..2cfaa0564c 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -48,7 +48,8 @@ template void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, + stream); } /** @@ -79,7 +80,8 @@ template void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, const Type *vec2, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, + bcastAlongRows, op, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 588169c580..3935e648dc 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -57,7 +57,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/svd.hpp 
b/cpp/include/raft/linalg/svd.hpp index 7f1651cc81..970c339090 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -44,7 +44,9 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, bool trans_right, bool gen_left_vec, bool gen_right_vec, cudaStream_t stream) { - detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, right_sing_vecs, trans_right, gen_left_vec, gen_right_vec, stream); + detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, + right_sing_vecs, trans_right, gen_left_vec, gen_right_vec, + stream); } template @@ -75,7 +77,9 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, math_t *sing_vals, math_t *left_sing_vecs, math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, math_t tol, int max_sweeps, cudaStream_t stream) { - detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, right_sing_vecs, gen_left_vec, gen_right_vec, tol, max_sweeps, stream); + detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, + right_sing_vecs, gen_left_vec, gen_right_vec, tol, + max_sweeps, stream); } /** @@ -122,7 +126,8 @@ template bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, math_t *S_vec, math_t *V, int n_rows, int n_cols, int k, math_t tol, cudaStream_t stream) { - return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); + return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, + k, tol, stream); } }; // end namespace linalg diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 5c525ce791..765a38f583 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -69,9 +69,9 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); 
raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row, params.n_col, 3, - eig_vectors.data(), eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, stream); + raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row, + params.n_col, 3, eig_vectors.data(), eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } From f7d43b568621b64c7e0d546ba25b33eb70a97ad6 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 17 Nov 2021 11:33:21 -0800 Subject: [PATCH 04/17] correcting include --- cpp/include/raft/linalg/mean_squared_error.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 9d1538c172..89d91719c7 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -16,7 +16,7 @@ #pragma once -#include "map_then_reduce.cuh" +#include "map_then_reduce.hpp" namespace raft { namespace linalg { From 9c0d6551b38b05110218d9c4efa7852951a7a488 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Dec 2021 12:04:50 -0800 Subject: [PATCH 05/17] removing deleted file again --- cpp/include/raft/sparse/selection/knn.cuh | 436 ---------------------- 1 file changed, 436 deletions(-) delete mode 100644 cpp/include/raft/sparse/selection/knn.cuh diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh deleted file mode 100644 index 631a740bfb..0000000000 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace sparse { -namespace selection { - -template -struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, value_idx n_rows, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data) - : batch_start_(0), - batch_stop_(0), - batch_rows_(0), - total_rows_(n_rows), - batch_size_(batch_size), - csr_indptr_(csr_indptr), - csr_indices_(csr_indices), - csr_data_(csr_data), - batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) {} - - void set_batch(int batch_num) { - batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - - if (batch_stop_ >= total_rows_) - batch_stop_ = total_rows_ - 1; // zero-based indexing - - batch_rows_ = (batch_stop_ - batch_start_) + 1; - } - - value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_indptr( - batch_start_, batch_stop_, csr_indptr_, batch_indptr, - &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); - - return batch_csr_stop_offset_ - batch_csr_start_offset_; - } - - void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_populate( - batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, - csr_indices, csr_data, stream); - } - - value_idx batch_rows() const { return batch_rows_; 
} - - value_idx batch_start() const { return batch_start_; } - - value_idx batch_stop() const { return batch_stop_; } - - private: - value_idx batch_size_; - value_idx batch_start_; - value_idx batch_stop_; - value_idx batch_rows_; - - value_idx total_rows_; - - const value_idx *csr_indptr_; - const value_idx *csr_indices_; - const value_t *csr_data_; - - value_idx batch_csr_start_offset_; - value_idx batch_csr_stop_offset_; -}; - -template -class sparse_knn_t { - public: - sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, - const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, - int n_idx_cols_, const value_idx *queryIndptr_, - const value_idx *queryIndices_, const value_t *queryData_, - size_t queryNNZ_, int n_query_rows_, int n_query_cols_, - value_idx *output_indices_, value_t *output_dists_, int k_, - const raft::handle_t &handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = - raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) - : idxIndptr(idxIndptr_), - idxIndices(idxIndices_), - idxData(idxData_), - idxNNZ(idxNNZ_), - n_idx_rows(n_idx_rows_), - n_idx_cols(n_idx_cols_), - queryIndptr(queryIndptr_), - queryIndices(queryIndices_), - queryData(queryData_), - queryNNZ(queryNNZ_), - n_query_rows(n_query_rows_), - n_query_cols(n_query_cols_), - output_indices(output_indices_), - output_dists(output_dists_), - k(k_), - handle(handle_), - batch_size_index(batch_size_index_), - batch_size_query(batch_size_query_), - metric(metric_), - metricArg(metricArg_) {} - - void run() { - using namespace raft::sparse; - - int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); - csr_batcher_t query_batcher( - batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); - - size_t rows_processed = 0; - - for (int i = 0; i < n_batches_query; i++) { - /** - * Compute index batch info - */ - query_batcher.set_batch(i); - - /** 
- * Slice CSR to rows in batch - */ - - rmm::device_uvector query_batch_indptr( - query_batcher.batch_rows() + 1, handle.get_stream()); - - value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( - query_batch_indptr.data(), handle.get_stream()); - - rmm::device_uvector query_batch_indices(n_query_batch_nnz, - handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, - handle.get_stream()); - - query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), - query_batch_data.data(), - handle.get_stream()); - - // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent - // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, - handle.get_stream()); - rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - - value_t *dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_ptr; - - int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); - csr_batcher_t idx_batcher( - batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); - - for (int j = 0; j < n_batches_idx; j++) { - idx_batcher.set_batch(j); - - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - - /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr( - idx_batcher.batch_rows() + 1, handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, - handle.get_stream()); - rmm::device_uvector idx_batch_data(0, handle.get_stream()); - - value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( - idx_batch_indptr.data(), handle.get_stream()); - - idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); - idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); - - idx_batcher.get_batch_csr_indices_data( - idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); - - 
/** - * Compute distances - */ - size_t dense_size = - idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, - handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, - batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, query_batcher, idx_batch_nnz, - n_query_batch_nnz, idx_batch_indptr.data(), - idx_batch_indices.data(), idx_batch_data.data(), - query_batch_indptr.data(), query_batch_indices.data(), - query_batch_data.data(), batch_dists.data()); - - // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), - handle.get_stream()); - - // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); - - iota_fill(batch_indices.data(), batch_rows, batch_cols, - handle.get_stream()); - - /** - * Perform k-selection on batch & merge with other k-selections - */ - size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = - merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = - merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), - batch_indices.data(), dists_merge_buffer_ptr, - indices_merge_buffer_ptr); - - value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; - - // Merge results of difference batches if necessary - if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = - merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = - merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), - merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, - indices_merge_buffer_tmp_ptr); - } - - // copy merged output back into merge buffer partition for next iteration 
- raft::copy_async(merge_buffer_indices.data(), - indices_merge_buffer_tmp_ptr, - batch_rows * k, handle.get_stream()); - raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, batch_rows * k, - handle.get_stream()); - } - - // Copy final merged batch to output array - raft::copy_async( - output_indices + (rows_processed * k), merge_buffer_indices.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - raft::copy_async( - output_dists + (rows_processed * k), merge_buffer_dists.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - - rows_processed += query_batcher.batch_rows(); - } - } - - private: - void merge_batches(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - value_t *merge_buffer_dists, - value_idx *merge_buffer_indices, value_t *out_dists, - value_idx *out_indices) { - // build translation buffer to shift resulting indices by the batch - std::vector id_ranges; - id_ranges.push_back(0); - id_ranges.push_back(idx_batcher.batch_start()); - - rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), - handle.get_stream()); - - // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts( - merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, - query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); - } - - void perform_k_selection(csr_batcher_t idx_batcher, - csr_batcher_t query_batcher, - value_t *batch_dists, value_idx *batch_indices, - value_t *out_dists, value_idx *out_indices) { - // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); - - // build translation buffer to shift resulting indices by the batch - std::vector id_ranges; - id_ranges.push_back(0); - id_ranges.push_back(idx_batcher.batch_start()); - - // in the case where the number of idx rows in the batch is < k, we - 
// want to adjust k. - value_idx n_neighbors = min(k, batch_cols); - - bool ascending = true; - if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; - - // kernel to slice first (min) k cols and copy into batched merge buffer - raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows, - batch_cols, out_dists, out_indices, ascending, - n_neighbors, handle.get_stream()); - } - - void compute_distances(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - size_t idx_batch_nnz, size_t query_batch_nnz, - value_idx *idx_batch_indptr, - value_idx *idx_batch_indices, value_t *idx_batch_data, - value_idx *query_batch_indptr, - value_idx *query_batch_indices, - value_t *query_batch_data, value_t *batch_dists) { - /** - * Compute distances - */ - raft::sparse::distance::distances_config_t dist_config( - handle); - dist_config.b_nrows = idx_batcher.batch_rows(); - dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; - - dist_config.b_indptr = idx_batch_indptr; - dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; - - dist_config.a_nrows = query_batcher.batch_rows(); - dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; - - dist_config.a_indptr = query_batch_indptr; - dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; - - if (raft::sparse::distance::supportedDistance.find(metric) == - raft::sparse::distance::supportedDistance.end()) - THROW("DistanceType not supported: %d", metric); - - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, - metricArg); - } - - const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx *output_indices; - const value_t *idxData, *queryData; - value_t *output_dists; - - size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; - - raft::distance::DistanceType metric; - - float metricArg; - - int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - - 
const raft::handle_t &handle; -}; - -/** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ -template -void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, - const value_t *idxData, size_t idxNNZ, int n_idx_rows, - int n_idx_cols, const value_idx *queryIndptr, - const value_idx *queryIndices, const value_t *queryData, - size_t queryNNZ, int n_query_rows, int n_query_cols, - value_idx *output_indices, value_t *output_dists, int k, - const raft::handle_t 
&handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - sparse_knn_t( - idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, - queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, handle, batch_size_index, batch_size_query, - metric, metricArg) - .run(); -} - -}; // namespace selection -}; // namespace sparse -}; // namespace raft From a071d09dab8ae6850d35d550cbc4e368d1f9ee70 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Dec 2021 12:52:10 -0800 Subject: [PATCH 06/17] correcting merges and passing tests --- cpp/include/raft/linalg/add.hpp | 41 +- cpp/include/raft/linalg/binary_op.hpp | 12 +- .../raft/linalg/cholesky_r1_update.hpp | 16 +- .../raft/linalg/coalesced_reduction.hpp | 25 +- cpp/include/raft/linalg/detail/add.cuh | 26 +- cpp/include/raft/linalg/detail/binary_op.cuh | 31 - .../raft/linalg/detail/cholesky_r1_update.hpp | 67 +- .../linalg/detail/coalesced_reduction.cuh | 32 - .../raft/linalg/detail/contractions.cuh | 117 +-- cpp/include/raft/linalg/detail/eig.hpp | 22 +- cpp/include/raft/linalg/detail/functional.cuh | 15 +- cpp/include/raft/linalg/detail/gemm.hpp | 41 - cpp/include/raft/linalg/detail/lanczos.hpp | 778 ++++++++++++------ cpp/include/raft/linalg/detail/map.cuh | 19 +- .../raft/linalg/detail/map_then_reduce.cuh | 90 +- .../raft/linalg/detail/strided_reduction.cuh | 32 - cpp/include/raft/linalg/detail/subtract.cuh | 23 +- cpp/include/raft/linalg/detail/svd.cuh | 64 +- cpp/include/raft/linalg/detail/unary_op.cuh | 72 +- cpp/include/raft/linalg/divide.hpp | 4 +- cpp/include/raft/linalg/eig.hpp | 43 +- cpp/include/raft/linalg/eltwise.hpp | 39 +- cpp/include/raft/linalg/gemm.hpp | 67 +- cpp/include/raft/linalg/lanczos.hpp | 158 +++- cpp/include/raft/linalg/map.hpp | 7 +- cpp/include/raft/linalg/map_then_reduce.hpp | 30 +- 
cpp/include/raft/linalg/matrix_vector_op.hpp | 33 +- cpp/include/raft/linalg/qr.hpp | 19 +- cpp/include/raft/linalg/strided_reduction.hpp | 25 +- cpp/include/raft/linalg/subtract.hpp | 12 +- cpp/include/raft/linalg/svd.hpp | 121 ++- cpp/include/raft/linalg/unary_op.hpp | 18 +- .../sparse/distance/detail/l2_distance.cuh | 5 - .../raft/sparse/selection/detail/knn.cuh | 2 +- .../raft/spatial/knn/detail/fused_l2_knn.cuh | 2 +- cpp/test/linalg/eig_sel.cu | 12 +- 36 files changed, 1172 insertions(+), 948 deletions(-) diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index 452cb00051..2a59339c20 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -42,16 +42,9 @@ using detail::adds_scalar; * @param stream cuda stream where to launch work */ template -<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp -void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { - unaryOp(out, in, len, adds_scalar(scalar), stream); -======= void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; - unaryOp(out, in, len, op, stream); ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh + unaryOp(out, in, len, adds_scalar(scalar), stream); } /** @@ -68,26 +61,9 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s * @param stream cuda stream where to launch work */ template -<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp -void add(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::plus(), stream); -======= void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; - binaryOp(out, in1, in2, len, op, stream); -} - -template -__global__ void add_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const 
math_t* singleScalarDev, - IdxType len) -{ - IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh + binaryOp(out, in1, in2, len, thrust::plus(), stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and @@ -101,24 +77,13 @@ __global__ void add_dev_scalar_kernel(math_t* outDev, * @param stream cuda stream */ template -<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { - detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); -======= void addDevScalar(math_t* outDev, const math_t* inDev, const math_t* singleScalarDev, IdxType len, cudaStream_t stream) { - // TODO: block dimension has not been tuned - dim3 block(256); - dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh + detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 1e03a1d231..e482240b59 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -39,10 +39,14 @@ namespace linalg { * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ detail::binaryOp(out, in1, in2, len, op, stream); } diff --git 
a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index ed0307cd2b..2428972d85 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -118,11 +118,17 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { - detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, - stream, eps); +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); } }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index ad5279b1ad..a8f19f61b1 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -48,17 +48,24 @@ namespace linalg { * @param inplace reduction result added inplace or overwrites old values? 
* @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType *dots, const InType *data, int D, int N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { - detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + FinalLambda final_op = raft::Nop()) +{ + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index be7b8bb299..8459f7924d 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -23,25 +23,27 @@ namespace linalg { namespace detail { template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } template -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); 
- add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); - CUDA_CHECK(cudaPeekAtLastError()); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh index de7ca96fe1..7c9ba2aeed 100644 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -60,36 +60,6 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6 return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/binary_op.cuh -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { - constexpr auto maxSize = - sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { -======= -/** - * @brief perform element-wise binary operation on the input arrays - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val1, const InType& val2);` - */ template >>>>>> upstream/branch-22.02:cpp/include/raft/linalg/binary_op.cuh binaryOpImpl( out, in1, in2, len, op, stream); } 
else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp index 49bb190836..db00c5d6fc 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -27,9 +27,16 @@ namespace linalg { namespace detail { template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -49,18 +56,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; - math_t *A_new; - math_t *A_row; + math_t* A_new; + math_t* A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -69,30 +75,39 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. 
We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + RAFT_CUBLAS_TRY( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + RAFT_CUBLAS_TRY( + raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); + RAFT_CUBLAS_TRY( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case - CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); } // L_22 = sqrt(A_22 - L_12 * L_12) @@ -100,16 +115,14 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, math_t L_22_host; raft::update_host(&s_host, s, 1, stream); raft::update_host(&L_22_host, L_22, 1, stream); // 
L_22 stores A_22 - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); L_22_host = std::sqrt(L_22_host - s_host); // Check for numeric error with sqrt. If the matrix is not positive definit or // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh index 4a8660741f..bb451bf13a 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -62,42 +62,10 @@ __global__ void coalescedReductionKernel(OutType* dots, } } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/coalesced_reduction.cuh -template , -======= -/** - * @brief Compute reduction of the input matrix along the leading dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template , ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/coalesced_reduction.cuh typename ReduceLambda = raft::Sum, typename FinalLambda = raft::Nop> void coalescedReduction(OutType* dots, diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index b04c813cd8..d5dd416c49 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -22,8 +22,7 @@ namespace raft { namespace linalg { namespace detail { -template +template struct Contractions_NT { protected: typedef Policy P; @@ -81,16 +80,15 @@ struct Contractions_NT { public: /** -* @brief Ctor -* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] -* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] -* @param[in] _m number of rows of X -* @param[in] _n number of rows of Y -* @param[in] _k number of cols of X and Y -* @param[in] _smem shared memory region used during computations -*/ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) + * @brief Ctor + * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] + * @param[in] _y Y matrix. 
[on device] [dim = _n x _k] [row-major] + * @param[in] _m number of rows of X + * @param[in] _n number of rows of Y + * @param[in] _k number of cols of X and Y + * @param[in] _smem shared memory region used during computations + */ + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -107,19 +105,28 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) {} + pageRd(0) + { + } /** -* @brief Ctor -* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] -* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major] -* @param[in] _m number of rows of X -* @param[in] _n number of rows of Y -* @param[in] _k number of cols of X and Y -* @param[in] _smem shared memory region used during computations -*/ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + * @brief Ctor + * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major] + * @param[in] _y Y matrix. 
[on device] [dim = _n x _k] [row-major] + * @param[in] _m number of rows of X + * @param[in] _n number of rows of Y + * @param[in] _k number of cols of X and Y + * @param[in] _smem shared memory region used during computations + */ + DI Contractions_NT(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + char* _smem) : m(_m), n(_n), k(_k), @@ -133,50 +140,55 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) { + pageRd(0) + { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } protected: /** -* @brief Load current block of X/Y from global memory to registers -* @param[in] kidx current start index of k to be loaded -*/ - DI void ldgXY(IdxT kidx) { + * @brief Load current block of X/Y from global memory to registers + * @param[in] kidx current start index of k to be loaded + */ + DI void ldgXY(IdxT kidx) + { ldgX(kidx); ldgY(kidx); } /** -* @brief Store current block of X/Y from registers to smem -* @param[in] kidx current start index of k to be loaded -*/ - DI void stsXY() { + * @brief Store current block of X/Y from registers to smem + * @param[in] kidx current start index of k to be loaded + */ + DI void stsXY() + { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } /** -* @brief Load X and Y block from shared memory to registers -* @param[in] kidx k value from the current k-block to be loaded from smem -*/ - DI void ldsXY(int kidx) { + * @brief Load X and Y block from shared memory to registers + * @param[in] kidx k value from the current k-block to be loaded from smem + */ 
+ DI void ldsXY(int kidx) + { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) { + DI void ldgX(IdxT kidx) + { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -193,11 +205,10 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -209,7 +220,8 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) { + DI void ldgY(IdxT kidx) + { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -229,8 +241,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -242,7 +253,8 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) { + DI void stsX(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -250,7 +262,8 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) { + DI void stsY(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -258,7 +271,8 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) { + DI void ldsX(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -277,7 +291,8 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) { + DI void ldsY(int 
kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index 997e98dcc4..704fe339dc 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -247,34 +247,22 @@ void eigSelDC(const raft::handle_t& handle, #endif template -<<<<<<< HEAD:cpp/include/raft/linalg/detail/eig.hpp -void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - - syevjInfo_t syevj_params = nullptr; - CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); - CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); -======= void eigJacobi(const raft::handle_t& handle, const math_t* in, - std::size_t n_rows, - std::size_t n_cols, + int n_rows, + int n_cols, math_t* eig_vectors, math_t* eig_vals, cudaStream_t stream, - math_t tol = 1.e-7, - std::uint32_t sweeps = 15) + math_t tol = 1.e-7, + int sweeps = 15) { cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params)); RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/eig.cuh + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH, diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh index fec2e27228..4cebd34d08 100644 --- a/cpp/include/raft/linalg/detail/functional.cuh +++ 
b/cpp/include/raft/linalg/detail/functional.cuh @@ -27,9 +27,7 @@ struct divides_scalar { public: divides_scalar(ArgType scalar) : scalar_(scalar) {} - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in / scalar_; - } + __host__ __device__ inline ReturnType operator()(ArgType in) { return in / scalar_; } private: ArgType scalar_; @@ -40,9 +38,7 @@ struct adds_scalar { public: adds_scalar(ArgType scalar) : scalar_(scalar) {} - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in + scalar_; - } + __host__ __device__ inline ReturnType operator()(ArgType in) { return in + scalar_; } private: ArgType scalar_; @@ -53,9 +49,7 @@ struct multiplies_scalar { public: multiplies_scalar(ArgType scalar) : scalar_(scalar) {} - __host__ __device__ inline ReturnType operator()(ArgType in) { - return in * scalar_; - } + __host__ __device__ inline ReturnType operator()(ArgType in) { return in * scalar_; } private: ArgType scalar_; @@ -64,7 +58,8 @@ struct multiplies_scalar { template struct divides_check_zero { public: - __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) { + __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) + { return (b == static_cast(0)) ? 
0.0 : a / b; } }; diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 8adeb5295f..8a74e78a79 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -52,47 +52,6 @@ void gemm(const raft::handle_t& handle, cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/gemm.hpp -======= -template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) -{ - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); -} - -/** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible - * combinations of operand layouts. - * It computes the following equation: Z = alpha . X * Y + beta . Z - * @tparam T Data type of input/output matrices (float/double) - * @param handle raft handle - * @param z output matrix of size M rows x N columns - * @param x input matrix of size M rows x K columns - * @param y input matrix of size K rows x N columns - * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z - * @param _K number of columns of X and rows of Y - * @param isZColMajor Storage layout of Z. true = col major, false = row major - * @param isXColMajor Storage layout of X. true = col major, false = row major - * @param isYColMajor Storage layout of Y. 
true = col major, false = row major - * @param stream cuda stream - * @param alpha scalar - * @param beta scalar - */ ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/gemm.cuh template void gemm(const raft::handle_t& handle, T* z, diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index dc75b70509..854b2333d6 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -//for cmath: +// for cmath: #define _USE_MATH_DEFINES #include @@ -41,14 +41,14 @@ namespace spectral { namespace detail { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - float *outputPtr, size_t n, - float mean, float stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - double *outputPtr, size_t n, - double mean, double stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -56,7 +56,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -86,25 +86,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * @return Zero if successful. Otherwise non-zero. 
*/ template -int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t *iter, index_type_t maxIter, value_type_t shift, - value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev) { +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -118,29 +123,28 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, - stream)); + CUBLAS_CHECK(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, - 
lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, - beta_host, stream)); + CUBLAS_CHECK(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), - 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -152,65 +156,121 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, - lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(value_type_t), cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, 
*iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, - alpha_host + (*iter - 1), stream)); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, 
+ &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, - beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -218,7 +278,7 @@ int performLanczosIteration( return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -236,8 +296,8 @@ int performLanczosIteration( * matrix. Matrix dimensions are 3 x 3. 
*/ template -static void findHouseholder3(value_type_t *v, value_type_t *Pv, - value_type_t *P) { +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -247,8 +307,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = - std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -262,11 +321,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -278,7 +339,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
*/ template -static void applyHouseholder3(const value_type_t *v, value_type_t *A) { +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -287,19 +349,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -320,10 +386,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { * @return Zero if successful. Otherwise non-zero. 
*/ template -static int francisQRIteration(index_type_t n, value_type_t shift1, - value_type_t shift2, value_type_t *alpha, - value_type_t *beta, value_type_t *V, - value_type_t *work) { +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -353,30 +423,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, - 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 
0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -386,22 +456,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); + findHouseholder3(householder, beta + pos, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -409,37 +479,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); + findHouseholder3(householder, beta + n - 4, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; 
householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -475,23 +544,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @return error flag. 
*/ template -static int lanczosRestart( - handle_t const &handle, index_type_t n, index_type_t iter, - index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, bool smallest_eig) { +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -502,12 +578,12 @@ static int lanczosRestart( index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t *ritzVals_host = work_host + 3 * iter; + value_type_t* ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t *shifts_host; + value_type_t* shifts_host; // Orthonormal matrix for similarity transform - value_type_t *V_dev = work_dev + n * iter; + value_type_t* V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -525,7 +601,8 @@ static int lanczosRestart( // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; 
i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -549,49 +626,71 @@ static int lanczosRestart( // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = - cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, - beta_host, V_host, work_host)) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = - beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, - n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, - &one, lanczosVecs_dev, n, V_dev, iter, 
&zero, - work_dev, n, stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, - beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, - lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -602,19 +701,28 @@ static int lanczosRestart( namespace detail { template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeSmallestEigenvectors(handle_t const& 
handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace raft::spectral::detail; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -634,21 +742,20 @@ int computeSmallestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -661,12 +768,11 @@ int computeSmallestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - 
cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -689,10 +795,18 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -707,9 +821,17 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -726,9 +848,19 @@ int computeSmallestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -737,9 +869,17 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, 
effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -750,32 +890,52 @@ int computeSmallestEigenvectors( } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - 
CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); @@ -783,20 +943,25 @@ int computeSmallestEigenvectors( } template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ using namespace raft::spectral::detail; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -806,8 +971,8 @@ int computeSmallestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); 
vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -815,29 +980,50 @@ int computeSmallestEigenvectors( // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = raft::detail::computeSmallestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = raft::detail::computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; } template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace raft::spectral::detail; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension 
@@ -853,8 +1039,8 @@ int computeLargestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -864,15 +1050,14 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -885,12 +1070,11 @@ int computeLargestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -910,13 +1094,21 @@ int computeLargestEigenvectors( CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t *shift = &shift_val; - - status = performLanczosIteration( - handle, A, 
effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -933,9 +1125,19 @@ int computeLargestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -944,9 +1146,17 @@ int computeLargestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -956,15 +1166,18 @@ int computeLargestEigenvectors( WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - 
Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -989,29 +1202,45 @@ int computeLargestEigenvectors( //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync( - eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); @@ -1019,18 +1248,23 @@ int 
computeLargestEigenvectors( } template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1040,18 +1274,30 @@ int computeLargestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = raft::detail::computeLargestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = raft::detail::computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + 
reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 0e649fb937..7f1ba3da0d 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -25,21 +25,18 @@ namespace raft { namespace linalg { namespace detail { -template -__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template -void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index a7031bc48f..089bc627be 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -1,18 +1,18 @@ /* -* Copyright (c) 2021, NVIDIA CORPORATION. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -25,50 +25,66 @@ namespace raft { namespace linalg { namespace detail { -struct sum_tag {}; +struct sum_tag { +}; template -__device__ void reduce(OutType *out, const InType acc, sum_tag) { +__device__ void reduce(OutType* out, const InType acc, sum_tag) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { - raft::myAtomicAdd(out, tmp); - } + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } } template -__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { +__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { - raft::myAtomicReduce(out, tmp, op); - } + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } } -template -__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, - MapOp map, ReduceLambda op, - const InType *in, Args... 
args) { +template +__global__ void mapThenReduceKernel(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) +{ OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - acc = map(in[idx], args[idx]...); - } + if (idx < len) { acc = map(in[idx], args[idx]...); } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduceImpl(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index e8956521df..8fdee6d30e 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -103,42 +103,10 @@ __global__ void stridedReductionKernel(OutType* dots, raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/strided_reduction.cuh -template , -======= -/** - * @brief Compute reduction of the input matrix along the strided dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*MainLambda)(InType, IdxType);
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*ReduceLambda)(OutType);
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - *
OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template , ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/strided_reduction.cuh typename ReduceLambda = raft::Sum, typename FinalLambda = raft::Nop> void stridedReduction(OutType* dots, diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index a58888a24f..a2e91a381a 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -25,20 +25,23 @@ namespace linalg { namespace detail { template -__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } } template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // Just for the note - there is no way to express such operation with cuBLAS in effective way // 
https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 691bef4d35..0d9cbc05dc 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -145,8 +145,7 @@ void svdEig(const raft::handle_t& handle, beta, stream); - raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, - stream); + raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); raft::matrix::colReverse(V, n_cols, n_cols, stream); raft::matrix::rowReverse(S, n_cols, 1, stream); @@ -239,67 +238,6 @@ void svdJacobi(const raft::handle_t& handle, RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/svd.cuh -======= -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param U: left singular vectors of size n_rows x k - * @param S: square matrix with singular values on its diagonal, k x k - * @param V: right singular vectors of size n_cols x k - * @param out: reconstructed matrix to be returned - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values - * @param stream cuda stream - */ -template -void svdReconstruction(const raft::handle_t& handle, - math_t* U, - math_t* S, - math_t* V, - math_t* out, - int n_rows, - int n_cols, - int k, - cudaStream_t stream) -{ - const math_t alpha = 1.0, beta = 0.0; - rmm::device_uvector SVT(k * n_cols, stream); - - raft::linalg::gemm( - handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, - U, - n_rows, - k, - SVT.data(), - out, - n_rows, - n_cols, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); -} - -/** - * @brief 
reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param A_d: input matrix - * @param U: left singular vectors of size n_rows x k - * @param S_vec: singular values as a vector - * @param V: right singular vectors of size n_cols x k - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values to be computed, 1.0 for normal SVD - * @param tol: tolerance for the evaluation - * @param stream cuda stream - */ ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/svd.cuh template bool evaluateSVDByL2Norm(const raft::handle_t& handle, math_t* A_d, diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh index 5e93157ed7..0089400201 100644 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -51,55 +51,12 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr RAFT_CUDA_TRY(cudaPeekAtLastError()); } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/unary_op.cuh -template -void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); -======= -/** - * @brief perform element-wise unary operation in the input array - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` - */ template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); @@ -114,7 +71,6 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ unaryOpImpl(out, in, len, op, stream); } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { unaryOpImpl(out, in, len, op, stream); ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/unary_op.cuh } else if (1 / maxSize) { unaryOpImpl(out, in, len, op, stream); } else { @@ -129,33 +85,9 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) if (idx < len) { op(out + idx, idx); } } -<<<<<<< HEAD:cpp/include/raft/linalg/detail/unary_op.cuh -template -void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { -======= -/** - * @brief Perform an element-wise unary operation into the output array - * - * Compared to `unaryOp()`, this method does not do any reads from any inputs - * - * @tparam OutType output data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * - * @param[out] out the output array [on device] [len = len] - * @param[in] len number of elements in the input array - * @param[in] op the device-lambda which must be of the form: - * `void func(OutType* outLocationOffset, IdxType idx);` - * where outLocationOffset will be out + idx. 
- * @param[in] stream cuda stream where to launch work - */ template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { ->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/unary_op.cuh if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); writeOnlyUnaryOpKernel<<>>(out, len, op); diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index e4eead777c..ecf0d3a48d 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -36,8 +36,8 @@ using detail::divides_scalar; * @{ */ template -void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp(out, in, len, divides_scalar(scalar), stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 288b43f27f..91a475f25f 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -35,9 +35,14 @@ namespace linalg { * @{ */ template -void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, - std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream) { +void eigDC(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); } @@ -62,11 +67,17 @@ using detail::OVERWRITE_INPUT; * @{ */ template -void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, - EigVecMemUsage memUsage, cudaStream_t stream) { - detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, - eig_vals, 
memUsage, stream); +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ + detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); } #endif @@ -86,11 +97,17 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @{ */ template -void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, - tol, sweeps); +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + int sweeps = 15) +{ + detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 63b824e6f7..5a5b5c647b 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -38,19 +38,17 @@ using detail::adds_scalar; * @{ */ template -void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { - raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), - stream); +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); } using detail::multiplies_scalar; template -void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { - raft::linalg::unaryOp(out, in, len, - multiplies_scalar(scalar), stream); +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ 
+ raft::linalg::unaryOp(out, in, len, multiplies_scalar(scalar), stream); } /** @} */ @@ -66,34 +64,39 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp(out, in1, in2, len, thrust::plus(), stream); } template -void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp(out, in1, in2, len, thrust::minus(), stream); } template -void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp(out, in1, in2, len, thrust::multiplies(), stream); } template -void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp(out, in1, in2, len, thrust::divides(), stream); } using detail::divides_check_zero; template -void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp(out, in1, in2, len, divides_check_zero(), stream); } /** @} */ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 9326714a41..624aa7232b 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -40,27 +40,45 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const 
raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, - math_t beta, cudaStream_t stream) { - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ + detail::gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, - cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . 
Z * @tparam T Data type of input/output matrices (float/double) @@ -79,11 +97,22 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, * @param beta scalar */ template -void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, - int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, - cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { - detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, - isYColMajor, stream, alpha, beta); +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ + detail::gemm( + handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 87ffb76163..34db473edb 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -74,19 +74,41 @@ namespace raft { * @return error flag. 
*/ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { - return raft::detail::computeSmallestEigenvectors( - handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, - totalIter, shift, alpha_host, beta_host, lanczosVecs_dev, work_dev, - eigVals_dev, eigVecs_dev, seed); +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + return raft::detail::computeSmallestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + effIter, + totalIter, + shift, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev, + eigVals_dev, + eigVecs_dev, + seed); } /** @@ -127,15 +149,29 @@ int computeSmallestEigenvectors( * @return error flag. 
*/ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { - return raft::detail::computeSmallestEigenvectors( - handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, - eigVals_dev, eigVecs_dev, seed); +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + return raft::detail::computeSmallestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); } // ========================================================= @@ -187,19 +223,39 @@ int computeSmallestEigenvectors( * @return error flag. 
*/ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { - return raft::detail::computeLargestEigenvectors( - handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, - totalIter, alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev, - eigVecs_dev, seed); +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + return raft::detail::computeLargestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + effIter, + totalIter, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev, + eigVals_dev, + eigVecs_dev, + seed); } /** @@ -240,15 +296,29 @@ int computeLargestEigenvectors( * @return error flag. 
*/ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { - return raft::detail::computeLargestEigenvectors( - handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, - eigVals_dev, eigVecs_dev, seed); +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + return raft::detail::computeLargestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); } } // namespace raft diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index 71ac959f77..c14fb7ba2b 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -41,10 +41,9 @@ template -void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { - detail::mapImpl(out, len, map, stream, - in, args...); +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapImpl(out, len, map, stream, in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 149c2401f5..d4d7087339 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -40,11 +40,11 @@ template -void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... 
args) { - detail::mapThenReduceImpl(out, len, (OutType)0, map, - detail::sum_tag(), stream, in, args...); +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapThenReduceImpl( + out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); } /** @@ -65,11 +65,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ detail::mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index 2cfaa0564c..f088ef4dce 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -45,11 +45,17 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, - stream); +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } /** @@ -77,11 +83,18 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type 
*vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op, stream); +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index ad19b361c2..b0e9eed5e2 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -37,8 +37,13 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream); } @@ -53,8 +58,14 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 3935e648dc..7e2b5229ec 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -48,17 +48,24 @@ namespace linalg { * @param inplace reduction result added inplace or overwrites old values? 
* @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { - detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + FinalLambda final_op = raft::Nop()) +{ + detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index 820a29cb33..88946646c8 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -62,7 +62,8 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream binaryOp(out, in1, in2, len, op, stream); } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -73,9 +74,12 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream * @remark block size has not been tuned */ template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const 
math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); } diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index 970c339090..62ac19b592 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -40,18 +40,42 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { - detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, - right_sing_vecs, trans_right, gen_left_vec, gen_right_vec, +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ + detail::svdQR(handle, + in, + n_rows, + n_cols, + sing_vals, + left_sing_vecs, + right_sing_vecs, + trans_right, + gen_left_vec, + gen_right_vec, stream); } template -void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, - T *U, T *V, bool gen_left_vec, cudaStream_t stream) { +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream); } @@ -73,13 +97,31 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, 
cudaStream_t stream) { - detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, - right_sing_vecs, gen_left_vec, gen_right_vec, tol, - max_sweeps, stream); +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + detail::svdJacobi(handle, + in, + n_rows, + n_cols, + sing_vals, + left_sing_vecs, + right_sing_vecs, + gen_left_vec, + gen_right_vec, + tol, + max_sweeps, + stream); } /** @@ -96,16 +138,34 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, - math_t *V, math_t *out, int n_rows, int n_cols, int k, - cudaStream_t stream) { +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ const math_t alpha = 1.0, beta = 0.0; rmm::device_uvector SVT(k * n_cols, stream); - raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, - CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); } /** @@ -123,11 +183,18 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { - return 
detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, - k, tol, stream); +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ + return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index 13795f9297..c54e3cc1c3 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -36,10 +36,13 @@ namespace linalg { * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ detail::unaryOpCaller(out, in, len, op, stream); } @@ -60,10 +63,9 @@ void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, * where outLocationOffset will be out + idx. 
* @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ detail::writeOnlyUnaryOpCaller(out, len, op, stream); } diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index b91516279a..8fbd68f0a6 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -24,13 +24,8 @@ #include #include #include -<<<<<<< HEAD #include -#include -======= -#include #include ->>>>>>> upstream/branch-22.02 #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index efb8d0201d..6cd0e3154d 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 27a23034c5..e65c79b5bd 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -17,7 +17,7 @@ #include #include #include -#include +#include // TODO: Need to hide the PairwiseDistance class impl and expose to public API #include #include diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 8d19c61b63..ae95fac0b2 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -83,9 +83,15 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row, - params.n_col, 3, 
eig_vectors.data(), eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, stream); + raft::linalg::eigSelDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + 3, + eig_vectors.data(), + eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, + stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } From db817f62a6e8cdc6b65bdee18829a210d071e8a4 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Dec 2021 13:19:52 -0800 Subject: [PATCH 07/17] changing h extensions to hpp --- cpp/include/raft/distance/detail/distance.cuh | 2 +- cpp/include/raft/distance/distance.hpp | 2 +- cpp/include/raft/handle.hpp | 4 ++-- cpp/include/raft/label/merge_labels.cuh | 2 +- .../raft/linalg/{cublas_wrappers.h => cublas_wrappers.hpp} | 0 .../linalg/{cusolver_wrappers.h => cusolver_wrappers.hpp} | 0 cpp/include/raft/linalg/detail/cholesky_r1_update.hpp | 4 ++-- cpp/include/raft/linalg/detail/eig.hpp | 2 +- cpp/include/raft/linalg/detail/gemm.hpp | 2 +- cpp/include/raft/linalg/detail/lanczos.hpp | 2 +- cpp/include/raft/linalg/detail/qr.cuh | 4 ++-- cpp/include/raft/linalg/detail/svd.cuh | 6 +++--- .../raft/linalg/{distance_type.h => distance_type.hpp} | 0 cpp/include/raft/linalg/{gemv.h => gemv.hpp} | 2 +- cpp/include/raft/linalg/{init.h => init.hpp} | 0 cpp/include/raft/linalg/{transpose.h => transpose.hpp} | 2 +- cpp/include/raft/matrix/matrix.hpp | 2 +- cpp/include/raft/sparse/distance/detail/bin_distance.cuh | 2 +- cpp/include/raft/sparse/distance/detail/ip_distance.cuh | 2 +- cpp/include/raft/sparse/distance/detail/l2_distance.cuh | 2 +- cpp/include/raft/sparse/distance/detail/lp_distance.cuh | 2 +- cpp/include/raft/sparse/distance/distance.hpp | 2 +- cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh | 2 +- cpp/include/raft/sparse/selection/detail/knn.cuh | 2 +- cpp/include/raft/sparse/selection/detail/knn_graph.cuh | 2 +- cpp/include/raft/sparse/selection/knn.hpp | 2 +- cpp/include/raft/sparse/selection/knn_graph.hpp | 2 +- cpp/include/raft/spatial/knn/ann_common.h 
| 2 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 2 +- cpp/include/raft/spatial/knn/ball_cover_common.h | 2 +- cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh | 2 +- cpp/include/raft/spatial/knn/detail/common_faiss.h | 2 +- cpp/include/raft/spatial/knn/detail/haversine_distance.cuh | 2 +- .../raft/spatial/knn/detail/knn_brute_force_faiss.cuh | 2 +- cpp/include/raft/spatial/knn/detail/processing.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 2 +- cpp/include/raft/spectral/lapack.hpp | 4 ++-- cpp/include/raft/spectral/matrix_wrappers.hpp | 2 +- cpp/test/linalg/cholesky_r1.cu | 2 +- cpp/test/linalg/gemv.cu | 2 +- cpp/test/linalg/reduce.cuh | 2 +- cpp/test/linalg/transpose.cu | 2 +- cpp/test/sparse/connect_components.cu | 4 ++-- cpp/test/sparse/dist_coo_spmv.cu | 2 +- cpp/test/sparse/distance.cu | 2 +- cpp/test/sparse/knn.cu | 2 +- cpp/test/sparse/linkage.cu | 4 ++-- cpp/test/spatial/ball_cover.cu | 2 +- cpp/test/spatial/fused_l2_knn.cu | 2 +- cpp/test/spatial/haversine.cu | 2 +- cpp/test/spatial/knn.cu | 2 +- 51 files changed, 55 insertions(+), 55 deletions(-) rename cpp/include/raft/linalg/{cublas_wrappers.h => cublas_wrappers.hpp} (100%) rename cpp/include/raft/linalg/{cusolver_wrappers.h => cusolver_wrappers.hpp} (100%) rename cpp/include/raft/linalg/{distance_type.h => distance_type.hpp} (100%) rename cpp/include/raft/linalg/{gemv.h => gemv.hpp} (99%) rename cpp/include/raft/linalg/{init.h => init.hpp} (100%) rename cpp/include/raft/linalg/{transpose.h => transpose.hpp} (98%) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 9eeccdb827..a004d24ae8 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include namespace raft { diff --git a/cpp/include/raft/distance/distance.hpp 
b/cpp/include/raft/distance/distance.hpp index 66832c12d2..745d1fea90 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include #include namespace raft { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index bba7fabc54..fb45fa13a8 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -32,10 +32,10 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include -#include #include #include +#include +#include #include #include #include "cudart_utils.h" diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index 9818b5d71b..33413fafe5 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include namespace raft { namespace label { diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.hpp similarity index 100% rename from cpp/include/raft/linalg/cublas_wrappers.h rename to cpp/include/raft/linalg/cublas_wrappers.hpp diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.hpp similarity index 100% rename from cpp/include/raft/linalg/cusolver_wrappers.h rename to cpp/include/raft/linalg/cusolver_wrappers.hpp diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp index db00c5d6fc..641b38ff40 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -16,11 +16,11 @@ #pragma once -#include -#include #include #include #include +#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index 704fe339dc..6475ce969b 100644 --- 
a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 8a74e78a79..0954097b80 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -17,9 +17,9 @@ #pragma once #include -#include #include #include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index 854b2333d6..f7052eef14 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -26,8 +26,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 5ca9850900..8dc46eeb9b 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 0d9cbc05dc..81bfa06f27 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -17,13 +17,13 @@ #pragma once #include -#include -#include -#include #include #include +#include +#include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/distance_type.h b/cpp/include/raft/linalg/distance_type.hpp similarity index 100% rename from cpp/include/raft/linalg/distance_type.h rename to cpp/include/raft/linalg/distance_type.hpp diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.hpp similarity index 99% rename from cpp/include/raft/linalg/gemv.h rename to cpp/include/raft/linalg/gemv.hpp index 965cd32a57..7dfd1f1db1 100644 --- 
a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.hpp @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.hpp similarity index 100% rename from cpp/include/raft/linalg/init.h rename to cpp/include/raft/linalg/init.hpp diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.hpp similarity index 98% rename from cpp/include/raft/linalg/transpose.h rename to cpp/include/raft/linalg/transpose.hpp index 63dbae1c8a..09e9e67e7b 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.hpp @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include namespace raft { diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index a7a43cff6e..00651a9b62 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -21,10 +21,10 @@ #include #include #include -#include #include #include #include +#include namespace raft { namespace matrix { diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index ad97e0853a..141e5b3e5f 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -19,11 +19,11 @@ #include #include -#include #include #include #include #include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 03c13df511..0f8b2d99bb 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 
8fbd68f0a6..62bfb7671e 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -19,11 +19,11 @@ #include #include -#include #include #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index 5be9de97c3..d062705b57 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -19,9 +19,9 @@ #include #include -#include #include #include +#include #include diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 9b708f4b27..c49730bdb9 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -19,9 +19,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index fdd03a5faa..0c47b22201 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -23,8 +23,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index 6cd0e3154d..21a40cf626 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh index 83cb23f513..c96fefdc5d 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ 
b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh @@ -24,7 +24,7 @@ #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index 141026dc82..bfc0c14a8c 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index 7af452541f..2a3159900c 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 573a23181d..e2df51a62b 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index cb2b9e99cd..4495221a34 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -18,8 +18,8 @@ #include -#include #include +#include #include "ball_cover_common.h" #include "detail/ball_cover.cuh" #include "detail/ball_cover/common.cuh" diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index e38124edb6..9ed1d2f726 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index b7f124c51e..6f223fdb43 100644 --- 
a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,7 @@ #include -#include +#include #include diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 5618186dfc..3708523b4f 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -20,7 +20,7 @@ #include #include -#include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 049c11514c..50340a284b 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -26,8 +26,8 @@ #include #include -#include #include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 12b7124773..54509b4a51 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -29,11 +29,11 @@ #include #include -#include #include #include #include #include +#include #include #include "fused_l2_knn.cuh" diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 905e797841..5a4672e711 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 549dd4917c..cbd0486086 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -29,9 +29,9 @@ #include #include -#include 
#include #include +#include #include #include diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 35fc22c770..a47c41564c 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -17,9 +17,9 @@ #pragma once #include -#include -#include #include +#include +#include // for now; TODO: check if/where this `define` should be; // diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 0d79904707..6f9d383c63 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -16,9 +16,9 @@ #pragma once #include -#include #include #include +#include #include #include diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index b93b0b90e9..85f5d7ada1 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -16,9 +16,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index 962b17fa24..580effbe50 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include "../test_utils.h" diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index dfef1cf054..aae57e136e 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include #include diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index a63b08e970..01962fcd23 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -16,8 +16,8 @@ #include #include -#include #include +#include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index df138e2bdb..2c56a902d4 100644 --- a/cpp/test/sparse/connect_components.cu +++ 
b/cpp/test/sparse/connect_components.cu @@ -26,8 +26,8 @@ #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 9701ec3259..eae8fec500 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index f4f346561c..d635c4a813 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 389e8c4b9c..6e4de63e4d 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -17,8 +17,8 @@ #include #include -#include #include +#include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 81e6dc4768..51947167cf 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -17,8 +17,8 @@ #include "../test_utils.h" #include -#include -#include +#include +#include #include #include diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index 73c0f87fdd..d63674c13c 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 078d5e0eec..303844b0a4 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 171b698265..d28fd55dbe 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -15,8 +15,8 @@ */ #include -#include #include 
+#include #include #include #include diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 2fb9bd2ca5..839d60095e 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,7 +16,7 @@ #include "../test_utils.h" -#include +#include #include #include From abec4d24f444cc1541c5e8d15210fee22d64bc58 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 21 Dec 2021 18:34:59 -0800 Subject: [PATCH 08/17] cublas/cusolver only in detail, wrap up rest of linalg --- cpp/include/raft/distance/detail/distance.cuh | 2 +- cpp/include/raft/distance/distance.hpp | 2 +- .../{linalg => distance}/distance_type.hpp | 2 +- cpp/include/raft/handle.hpp | 4 +- cpp/include/raft/linalg/add.hpp | 10 +- cpp/include/raft/linalg/binary_op.hpp | 2 +- .../raft/linalg/cholesky_r1_update.hpp | 2 +- .../raft/linalg/coalesced_reduction.hpp | 2 +- cpp/include/raft/linalg/detail/add.cuh | 16 +++ .../raft/linalg/detail/cholesky_r1_update.hpp | 39 +++-- .../linalg/{ => detail}/cublas_wrappers.hpp | 41 +----- .../linalg/{ => detail}/cusolver_wrappers.hpp | 6 +- cpp/include/raft/linalg/detail/divide.hpp | 34 +++++ cpp/include/raft/linalg/detail/eig.hpp | 6 +- cpp/include/raft/linalg/detail/eltwise.hpp | 77 ++++++++++ cpp/include/raft/linalg/detail/gemm.hpp | 21 ++- cpp/include/raft/linalg/detail/gemv.hpp | 117 +++++++++++++++ cpp/include/raft/linalg/detail/init.hpp | 54 +++++++ cpp/include/raft/linalg/detail/lanczos.hpp | 4 +- .../raft/linalg/detail/mean_squared_error.hpp | 38 +++++ cpp/include/raft/linalg/detail/multiply.hpp | 34 +++++ cpp/include/raft/linalg/detail/norm.hpp | 116 +++++++++++++++ cpp/include/raft/linalg/detail/qr.cuh | 13 +- cpp/include/raft/linalg/detail/reduce.hpp | 63 ++++++++ cpp/include/raft/linalg/detail/subtract.cuh | 14 ++ cpp/include/raft/linalg/detail/svd.cuh | 134 ++++++++++-------- cpp/include/raft/linalg/detail/transpose.hpp | 81 +++++++++++ cpp/include/raft/linalg/divide.hpp | 7 +- cpp/include/raft/linalg/eig.hpp | 6 +- 
cpp/include/raft/linalg/eltwise.hpp | 21 ++- cpp/include/raft/linalg/gemm.hpp | 23 ++- cpp/include/raft/linalg/gemv.hpp | 33 +---- cpp/include/raft/linalg/init.hpp | 18 +-- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/linalg/map.hpp | 2 +- cpp/include/raft/linalg/map_then_reduce.hpp | 2 +- cpp/include/raft/linalg/matrix_vector_op.hpp | 2 +- .../raft/linalg/mean_squared_error.hpp | 10 +- cpp/include/raft/linalg/multiply.hpp | 7 +- cpp/include/raft/linalg/norm.hpp | 72 +--------- cpp/include/raft/linalg/qr.hpp | 2 +- cpp/include/raft/linalg/reduce.hpp | 17 +-- cpp/include/raft/linalg/strided_reduction.hpp | 2 +- cpp/include/raft/linalg/subtract.hpp | 8 +- cpp/include/raft/linalg/svd.hpp | 21 +-- cpp/include/raft/linalg/transpose.hpp | 45 +----- cpp/include/raft/linalg/unary_op.hpp | 2 +- cpp/include/raft/matrix/matrix.hpp | 5 +- .../sparse/distance/detail/bin_distance.cuh | 2 +- .../sparse/distance/detail/ip_distance.cuh | 2 +- .../sparse/distance/detail/l2_distance.cuh | 2 +- .../sparse/distance/detail/lp_distance.cuh | 2 +- cpp/include/raft/sparse/distance/distance.hpp | 2 +- .../hierarchy/detail/connectivities.cuh | 2 +- .../raft/sparse/selection/detail/knn.cuh | 2 +- .../sparse/selection/detail/knn_graph.cuh | 2 +- cpp/include/raft/sparse/selection/knn.hpp | 2 +- .../raft/sparse/selection/knn_graph.hpp | 2 +- cpp/include/raft/spatial/knn/ann_common.h | 2 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 2 +- .../raft/spatial/knn/ball_cover_common.h | 2 +- .../knn/detail/ann_quantized_faiss.cuh | 2 +- .../raft/spatial/knn/detail/common_faiss.h | 2 +- .../spatial/knn/detail/haversine_distance.cuh | 2 +- .../knn/detail/knn_brute_force_faiss.cuh | 2 +- .../raft/spatial/knn/detail/processing.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 35 ++--- cpp/include/raft/spectral/lapack.hpp | 4 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 13 +- .../raft/spectral/modularity_maximization.hpp | 4 +- cpp/include/raft/spectral/spectral_util.hpp | 48 ++++--- 
cpp/test/linalg/cholesky_r1.cu | 24 ++-- cpp/test/linalg/reduce.cuh | 5 +- cpp/test/sparse/connect_components.cu | 2 +- cpp/test/sparse/dist_coo_spmv.cu | 2 +- cpp/test/sparse/distance.cu | 2 +- cpp/test/sparse/knn.cu | 2 +- cpp/test/sparse/linkage.cu | 2 +- cpp/test/spatial/ball_cover.cu | 2 +- cpp/test/spatial/fused_l2_knn.cu | 2 +- cpp/test/spatial/haversine.cu | 2 +- cpp/test/spatial/knn.cu | 2 +- 82 files changed, 967 insertions(+), 461 deletions(-) rename cpp/include/raft/{linalg => distance}/distance_type.hpp (97%) rename cpp/include/raft/linalg/{ => detail}/cublas_wrappers.hpp (95%) rename cpp/include/raft/linalg/{ => detail}/cusolver_wrappers.hpp (99%) create mode 100644 cpp/include/raft/linalg/detail/divide.hpp create mode 100644 cpp/include/raft/linalg/detail/eltwise.hpp create mode 100644 cpp/include/raft/linalg/detail/gemv.hpp create mode 100644 cpp/include/raft/linalg/detail/init.hpp create mode 100644 cpp/include/raft/linalg/detail/mean_squared_error.hpp create mode 100644 cpp/include/raft/linalg/detail/multiply.hpp create mode 100644 cpp/include/raft/linalg/detail/norm.hpp create mode 100644 cpp/include/raft/linalg/detail/reduce.hpp create mode 100644 cpp/include/raft/linalg/detail/transpose.hpp diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index a004d24ae8..45850de115 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 745d1fea90..935cf6677a 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -17,8 +17,8 @@ #pragma once #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/linalg/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp similarity 
index 97% rename from cpp/include/raft/linalg/distance_type.hpp rename to cpp/include/raft/distance/distance_type.hpp index 681a83f3f8..7a15c97f48 100644 --- a/cpp/include/raft/linalg/distance_type.hpp +++ b/cpp/include/raft/distance/distance_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index fb45fa13a8..d1b0e35260 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -34,8 +34,8 @@ #include #include -#include -#include +#include +#include #include #include #include "cudart_utils.h" diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index 2a59339c20..08496eef0d 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,10 +17,6 @@ #pragma once #include "detail/add.cuh" -#include "detail/functional.cuh" - -#include "binary_op.hpp" -#include "unary_op.hpp" namespace raft { namespace linalg { @@ -44,7 +40,7 @@ using detail::adds_scalar; template void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - unaryOp(out, in, len, adds_scalar(scalar), stream); + detail::addScalar(out, in, scalar, len, stream); } /** @@ -63,7 +59,7 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s template void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::plus(), stream); + detail::add(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index e482240b59..12afcbcd9a 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 2428972d85..9dc9630a86 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index a8f19f61b1..00ac7b4be9 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index 8459f7924d..7924f11e90 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -16,12 +16,28 @@ #pragma once +#include "functional.cuh" + #include +#include +#include namespace raft { namespace linalg { namespace detail { +template +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + template __global__ void add_dev_scalar_kernel(math_t* outDev, const math_t* inDev, diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp index 641b38ff40..45f76660e8 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -19,8 +19,8 @@ #include #include #include -#include -#include +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" namespace raft { namespace linalg { @@ -76,35 +76,32 @@ void choleskyRank1Update(const raft::handle_t& handle, // contiguous. We copy elements from A_row to a contiguous workspace A_new. 
A_row = L + n - 1; A_new = reinterpret_cast(workspace); - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(), - CUBLAS_SIDE_LEFT, - uplo, - op, - CUBLAS_DIAG_NON_UNIT, - n - 1, - 1, - &alpha, - L, - ld, - A_new, - n - 1, - stream)); + RAFT_CUBLAS_TRY(cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - RAFT_CUBLAS_TRY( - raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); + RAFT_CUBLAS_TRY(cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); diff --git a/cpp/include/raft/linalg/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp similarity index 95% rename from cpp/include/raft/linalg/cublas_wrappers.hpp rename to cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 024ed4a0e2..5c8779b0cf 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * @defgroup Axpy cublas ax+y operations @@ -142,7 +143,6 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } @@ -156,7 +156,6 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } /** @} */ @@ -173,7 +172,6 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } @@ -181,7 +179,6 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -199,14 +196,12 @@ template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } /** @} */ @@ -245,7 +240,6 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgemv(handle, transA, m, 
n, alfa, A, lda, x, incx, beta, y, incy); } @@ -264,7 +258,6 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -298,7 +291,6 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -315,7 +307,6 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } /** @} */ @@ -358,7 +349,6 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } @@ -379,7 +369,6 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -425,7 +414,6 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgemmBatched(handle, transa, transb, @@ -462,7 +450,6 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgemmBatched(handle, transa, transb, @@ -529,7 +516,6 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgemmStridedBatched(handle, transa, transb, @@ -572,7 +558,6 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, 
cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgemmStridedBatched(handle, transa, transb, @@ -619,7 +604,6 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -633,7 +617,6 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -662,7 +645,6 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -679,7 +661,6 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -720,7 +701,6 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -740,7 +720,6 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -783,7 +762,6 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } @@ -803,7 +781,6 @@ inline cublasStatus_t 
cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -844,7 +821,6 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -864,7 +840,6 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -901,7 +876,6 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -919,7 +893,6 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } /** @} */ @@ -936,7 +909,6 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } @@ -944,7 +916,6 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ @@ -979,7 +950,6 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -998,7 +968,6 @@ inline cublasStatus_t 
cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -1026,7 +995,6 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } @@ -1040,7 +1008,6 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } /** @} */ @@ -1061,7 +1028,6 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } /** @} */ @@ -1078,7 +1044,6 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } @@ -1086,11 +1051,11 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } /** @} */ +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp similarity index 99% rename from cpp/include/raft/linalg/cusolver_wrappers.hpp rename to cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 988e7512d5..2ff6825ea9 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,6 +115,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * @defgroup Getrf cusolver getrf operations @@ -441,7 +442,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT } /** @} */ -#if CUDART_VERSION >= 10010 /** * @defgroup syevdx cusolver syevdx operations * @{ @@ -575,7 +575,6 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ -#endif /** * @defgroup svd cusolver svd operations @@ -1509,5 +1508,6 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT /** @} */ #endif +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp new file mode 100644 index 0000000000..579a3317d6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/divide.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include "functional.cuh" + +namespace raft { +namespace linalg { +namespace detail { + +template +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, divides_scalar(scalar), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index 6475ce969b..859f058441 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -20,10 +20,10 @@ #include #include #include -#include #include #include #include +#include "cusolver_wrappers.hpp" namespace raft { namespace linalg { @@ -137,8 +137,6 @@ void eigDC(const raft::handle_t& handle, enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; -#if CUDART_VERSION >= 10010 - /** * @defgroup eig decomp with divide and conquer method for the column-major * symmetric matrices @@ -244,8 +242,6 @@ void eigSelDC(const raft::handle_t& handle, } } -#endif - template void eigJacobi(const raft::handle_t& handle, const math_t* in, diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp new file mode 100644 index 0000000000..e60c97e0e6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/eltwise.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "functional.cuh" + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, multiplies_scalar(scalar), stream); +} + +template +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + +template +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::minus(), stream); +} + +template +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::multiplies(), stream); +} + +template +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::divides(), stream); +} + +template +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, divides_check_zero(), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 0954097b80..43f275ab51 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include 
"cublas_wrappers.hpp" namespace raft { namespace linalg { @@ -52,6 +52,25 @@ void gemm(const raft::handle_t& handle, cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } +template +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); +} + template void gemm(const raft::handle_t& handle, T* z, diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp new file mode 100644 index 0000000000..b31fa71237 --- /dev/null +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include "cublas_wrappers.hpp" + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); +} + +}; // namespace detail +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp new file mode 100644 index 0000000000..9bae9533ea --- /dev/null +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + thrust::counting_iterator first(start); + thrust::counting_iterator last = first + (end - start); + thrust::device_ptr ptr(out); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. 
+ * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + range(out, 0, n, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index f7052eef14..b08e95c760 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -27,15 +27,15 @@ #include #include -#include #include #include #include +#include "cublas_wrappers.hpp" namespace raft { using namespace matrix; -using namespace linalg; +using namespace linalg::detail; namespace spectral { namespace detail { diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp new file mode 100644 index 0000000000..2ef9479b87 --- /dev/null +++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ + auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { + math_t diff = a - b; + return diff * diff * weight / len; + }; + raft::linalg::mapThenSumReduce(out, len, sq_diff, stream, A, B); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp new file mode 100644 index 0000000000..2cd83920c5 --- /dev/null +++ b/cpp/include/raft/linalg/detail/multiply.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp new file mode 100644 index 0000000000..492f34e59d --- /dev/null +++ b/cpp/include/raft/linalg/detail/norm.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +/** different types of norms supported on the input buffers */ +enum NormType { L1Norm = 0, L2Norm }; + +template +void rowNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! 
[%d]", type); + }; +} + +template +void colNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 8dc46eeb9b..0614af4aec 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -16,11 +16,11 @@ #pragma once -#include -#include #include #include #include +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" namespace raft { namespace linalg { @@ -50,10 +50,7 @@ void qrGetQ(const raft::handle_t& handle, rmm::device_uvector workspace(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf( cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); - /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. -#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr( @@ -95,10 +92,6 @@ void qrGetQR(const raft::handle_t& handle, Lwork, devInfo.data(), stream)); - // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
-#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp new file mode 100644 index 0000000000..181a7d52b1 --- /dev/null +++ b/cpp/include/raft/linalg/detail/reduce.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + if (rowMajor && alongRows) { + raft::linalg::coalescedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (rowMajor && !alongRows) { + raft::linalg::stridedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (!rowMajor && alongRows) { + raft::linalg::stridedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } else { + raft::linalg::coalescedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index a2e91a381a..26fe258825 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -24,6 +24,20 @@ namespace raft { namespace linalg { namespace detail { +template +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; + raft::linalg::unaryOp(out, in, len, op, stream); +} + +template +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; + raft::linalg::binaryOp(out, in1, in2, len, op, stream); +} + template __global__ void subtract_dev_scalar_kernel(math_t* outDev, const math_t* inDev, 
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 81bfa06f27..0d1d128f6f 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include #include @@ -28,6 +26,8 @@ #include #include #include +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" namespace raft { namespace linalg { @@ -49,15 +49,6 @@ void svdQR(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); -#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 - // 46340: sqrt of max int value - ASSERT(n_rows <= 46340, - "svd solver is not supported for the data that has more than 46340 " - "samples (rows) " - "if you are using CUDA version <11. Please use other solvers such as " - "eig if it is available."); -#endif - const int m = n_rows; const int n = n_cols; @@ -200,44 +191,75 @@ void svdJacobi(const raft::handle_t& handle, int lwork = 0; int econ = 1; - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - &lwork, - gesvdj_params)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - devInfo.data(), - gesvdj_params, - stream)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + 
lwork, + devInfo.data(), + gesvdj_params, + stream)); RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } +template +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ + const math_t alpha = 1.0, beta = 0.0; + rmm::device_uvector SVT(k * n_cols, stream); + + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); +} + template bool evaluateSVDByL2Norm(const raft::handle_t& handle, math_t* A_d, @@ -275,20 +297,20 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, rmm::device_uvector A_minus_P(m * n, stream); RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_N, - m, - n, - &alpha, - A_d, - m, - &beta, - P_d.data(), - m, - A_minus_P.data(), - m, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); math_t percent_error = 100.0 * norm_A_minus_P / normA; diff --git a/cpp/include/raft/linalg/detail/transpose.hpp b/cpp/include/raft/linalg/detail/transpose.hpp new file mode 100644 index 0000000000..b55843bd96 --- /dev/null +++ b/cpp/include/raft/linalg/detail/transpose.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "cublas_wrappers.hpp" + +namespace raft { +namespace linalg { +namespace detail { + +template +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + int out_n_rows = n_cols; + int out_n_cols = n_rows; + + const math_t alpha = 1.0; + const math_t beta = 0.0; + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); +} + +template +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + auto m = n; + auto size = n * n; + auto d_inout = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index ecf0d3a48d..a93ffe64fc 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,7 @@ #pragma once -#include "detail/functional.cuh" -#include "unary_op.hpp" +#include "detail/divide.hpp" namespace raft { namespace linalg { @@ -38,7 +37,7 @@ using detail::divides_scalar; template void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp(out, in, len, divides_scalar(scalar), stream); + detail::divideScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 91a475f25f..1680e58cbf 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,8 +50,6 @@ using detail::COPY_INPUT; using detail::EigVecMemUsage; using detail::OVERWRITE_INPUT; -#if CUDART_VERSION >= 10010 - /** * @defgroup eig decomp with divide and conquer method for the column-major * symmetric matrices @@ -80,8 +78,6 @@ void eigSelDC(const raft::handle_t& handle, detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); } -#endif - /** * @defgroup overloaded function for eig decomp with Jacobi method for the * column-major symmetric matrices (in parameter) diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 5a5b5c647b..930a125be7 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,7 @@ #pragma once -#include "detail/functional.cuh" - -#include "binary_op.hpp" -#include "unary_op.hpp" +#include "detail/eltwise.hpp" namespace raft { namespace linalg { @@ -40,7 +37,7 @@ using detail::adds_scalar; template void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); + detail::scalarAdd(out, in, scalar, len, stream); } using detail::multiplies_scalar; @@ -48,7 +45,7 @@ using detail::multiplies_scalar; template void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp(out, in, len, multiplies_scalar(scalar), stream); + detail::scalarMultiply(out, in, scalar, len, stream); } /** @} */ @@ -67,28 +64,28 @@ template void eltwiseAdd( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::plus(), stream); + detail::eltwiseAdd(out, in1, in2, len, stream); } template void eltwiseSub( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::minus(), stream); + detail::eltwiseSub(out, in1, in2, len, stream); } template void eltwiseMultiply( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::multiplies(), stream); + detail::eltwiseMultiply(out, in1, in2, len, stream); } template void eltwiseDivide( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, thrust::divides(), stream); + detail::eltwiseDivide(out, in1, in2, len, stream); } using detail::divides_check_zero; @@ -97,7 +94,7 @@ template void eltwiseDivideCheckZero( OutType* out, const InType* in1, const 
InType* in2, IdxType len, cudaStream_t stream) { - binaryOp(out, in1, in2, len, divides_check_zero(), stream); + detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 624aa7232b..19f79b2259 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,22 @@ void gemm(const raft::handle_t& handle, handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param stream cuda stream + */ template void gemm(const raft::handle_t& handle, const math_t* a, @@ -71,10 +87,7 @@ void gemm(const raft::handle_t& handle, cublasOperation_t trans_b, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); + detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); } /** diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 7dfd1f1db1..2a123e8895 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ 
b/cpp/include/raft/linalg/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,7 @@ #pragma once -#include -#include -#include - -#include +#include "detail/gemv.hpp" namespace raft { namespace linalg { @@ -39,10 +35,7 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - RAFT_CUBLAS_TRY( - cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); + detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream); } /** @@ -53,10 +46,6 @@ void gemv(const raft::handle_t& handle, * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. * - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
@@ -73,7 +62,7 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); } /** @@ -98,10 +87,7 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); } /** @@ -137,10 +123,7 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - RAFT_CUBLAS_TRY( - cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } /** @@ -171,9 +154,7 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); } }; // namespace linalg diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index 41ef4d4641..10498363e7 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,16 +16,11 @@ #pragma once -#include -#include -#include -#include +#include "detail/init.hpp" namespace raft { namespace linalg { -namespace { - /** * @brief Like Python range. 
* @@ -39,10 +34,7 @@ namespace { template void range(T* out, int start, int end, cudaStream_t stream) { - thrust::counting_iterator first(start); - thrust::counting_iterator last = first + (end - start); - thrust::device_ptr ptr(out); - thrust::copy(rmm::exec_policy(stream), first, last, ptr); + detail::range(out, start, end, stream); } /** @@ -57,8 +49,8 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - range(out, 0, n, stream); + detail::range(out, n, stream); } -} // unnamed namespace + } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 34db473edb..43164b676a 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index c14fb7ba2b..1c4b6816ae 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index d4d7087339..48c0318798 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index f088ef4dce..a8a805b4c2 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index f6318e1754..3a97a4396e 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "map_then_reduce.hpp" +#include "detail/mean_squared_error.hpp" namespace raft { namespace linalg { @@ -36,11 +36,7 @@ template void meanSquaredError( math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) { - auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { - math_t diff = a - b; - return diff * diff * weight / len; - }; - mapThenSumReduce(out, len, sq_diff, stream, A, B); + detail::meanSquaredError(out, A, B, len, weight, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index 66566692d5..edc84f2bcf 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "unary_op.hpp" +#include "detail/multiply.hpp" namespace raft { namespace linalg { @@ -35,8 +35,7 @@ namespace linalg { template void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); + detail::multiplyScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index 5b0de91513..79b060454b 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,15 @@ #pragma once -#include "reduce.hpp" +#include "detail/norm.hpp" namespace raft { namespace linalg { /** different types of norms supported on the input buffers */ -enum NormType { L1Norm = 0, L2Norm }; +using detail::L1Norm; +using detail::L2Norm; +using detail::NormType; /** * @brief Compute row-wise norm of the input matrix and perform fin_op lambda @@ -54,37 +56,7 @@ void rowNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! 
[%d]", type); - }; + detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } /** @@ -111,37 +83,7 @@ void colNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); - }; + detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index b0e9eed5e2..fb1c6be8be 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp index 339245e946..ca9ad34dc8 100644 --- a/cpp/include/raft/linalg/reduce.hpp +++ b/cpp/include/raft/linalg/reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,9 +16,7 @@ #pragma once -#include -#include "coalesced_reduction.hpp" -#include "strided_reduction.hpp" +#include "detail/reduce.hpp" namespace raft { namespace linalg { @@ -71,15 +69,8 @@ void reduce(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } + detail::reduce( + dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 7e2b5229ec..f8c37f07e0 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index 88946646c8..716db1a195 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,8 +38,7 @@ namespace linalg { template void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; - unaryOp(out, in, len, op, stream); + detail::subtractScalar(out, in, scalar, len, stream); } /** @@ -58,8 +57,7 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea template void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; - binaryOp(out, in1, in2, len, op, stream); + detail::subtract(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index 62ac19b592..331796c2ca 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -148,24 +148,7 @@ void svdReconstruction(const raft::handle_t& handle, int k, cudaStream_t stream) { - const math_t alpha = 1.0, beta = 0.0; - rmm::device_uvector SVT(k * n_cols, stream); - - raft::linalg::gemm( - handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, - U, - n_rows, - k, - SVT.data(), - out, - n_rows, - n_cols, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); + detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream); } /** diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index 09e9e67e7b..4c16df331c 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ #pragma once -#include -#include -#include +#include "detail/transpose.hpp" namespace raft { namespace linalg { @@ -40,27 +38,7 @@ void transpose(const raft::handle_t& handle, int n_cols, cudaStream_t stream) { - cublasHandle_t cublas_h = handle.get_cublas_handle(); - - int out_n_rows = n_cols; - int out_n_cols = n_rows; - - const math_t alpha = 1.0; - const math_t beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + detail::transpose(handle, in, out, n_rows, n_cols, stream); } /** @@ -72,22 +50,7 @@ void transpose(const raft::handle_t& handle, template void transpose(math_t* inout, int n, cudaStream_t stream) { - auto m = n; - auto size = n * n; - auto d_inout = inout; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] 
__device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + detail::transpose(inout, n, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index c54e3cc1c3..a7753ccff7 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index 00651a9b62..42be8d7bab 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include namespace raft { namespace matrix { @@ -285,7 +285,8 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; - RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 141e5b3e5f..21faffc17d 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh 
b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 0f8b2d99bb..63c39457af 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 62bfb7671e..ef578f0cf1 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index d062705b57..78f131b04b 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index c49730bdb9..8a709ae5ea 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 0c47b22201..e184d2be6e 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index 21a40cf626..947610d8cf 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ 
b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh index c96fefdc5d..c1f98eae12 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh @@ -24,7 +24,7 @@ #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index bfc0c14a8c..bb5edd2f17 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index 2a3159900c..357a65447c 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index e2df51a62b..ce1385e688 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index 4495221a34..23191f9415 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include "ball_cover_common.h" #include "detail/ball_cover.cuh" #include "detail/ball_cover/common.cuh" diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index 9ed1d2f726..e1a202107b 100644 --- 
a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -17,8 +17,8 @@ #pragma once #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6f223fdb43..b5d5e48231 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,7 @@ #include -#include +#include #include diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 3708523b4f..bf10356bfa 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -20,7 +20,7 @@ #include #include -#include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 50340a284b..8faf76f096 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -26,8 +26,8 @@ #include #include +#include #include -#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 54509b4a51..04b2dc3098 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -32,8 +32,8 @@ #include #include #include +#include #include -#include #include #include "fused_l2_knn.cuh" diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 5a4672e711..a515ca8507 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ 
b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index cbd0486086..56f4022a8c 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -657,20 +657,21 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); @@ -852,7 +853,9 @@ int kmeans(handle_t const& handle, } // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index a47c41564c..d066c68a68 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -18,8 +18,8 @@ #include #include -#include -#include +#include +#include // for now; TODO: check if/where this `define` should be; // diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp 
b/cpp/include/raft/spectral/matrix_wrappers.hpp index 6f9d383c63..d463b1b590 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -349,7 +349,8 @@ struct laplacian_matrix_t : sparse_matrix_t { if (beta == 0) { CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { - RAFT_CUBLAS_TRY(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); } // Apply diagonal matrix @@ -412,7 +413,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - RAFT_CUBLAS_TRY(linalg::cublasdot(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, laplacian_matrix_t::diagonal_.raw(), 1, @@ -424,7 +427,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - RAFT_CUBLAS_TRY(linalg::cublasaxpy(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasaxpy(cublas_h, n, &gamma_, laplacian_matrix_t::diagonal_.raw(), diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index c61b5f1458..8188a772b8 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -160,7 +160,9 @@ void analyzeModularity(handle_t const& handle, vector_t Bx(handle, n); // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Use public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize 
Modularity modularity_matrix_t B{handle, csr_m}; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index a30906de10..6b57566a73 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -132,7 +133,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::minus()); RAFT_CHECK_CUDA(stream); - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -149,22 +152,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); RAFT_CUDA_TRY(cudaMemcpyAsync( eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); @@ -216,14 +222,18 @@ bool construct_indicator(handle_t const& handle, RAFT_CHECK_CUDA(stream); // Compute size of ith partition - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + // TODO: Call from public 
API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 85f5d7ada1..6583d8d23c 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -42,7 +42,8 @@ class CholeskyR1Test : public ::testing::Test { // Allocate workspace solver_handle = handle.get_cusolver_dn_handle(); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize( + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize( solver_handle, CUBLAS_FILL_MODE_LOWER, n_rows, L.data(), n_rows, &Lwork)); int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace @@ -72,15 +73,16 @@ class CholeskyR1Test : public ::testing::Test { // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, handle.get_stream()); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(solver_handle, - uplo, - rank, - L_exp.data(), - n_rows, - (math_t*)workspace.data(), - Lwork, - devInfo.data(), - handle.get_stream())); + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + handle.get_stream())); // Incremental Cholesky factorization using rank one updates. 
raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index aae57e136e..c5728cd8d4 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -65,7 +65,8 @@ void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t raft::linalg::unaryOp( ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgemv( + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv( handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); RAFT_CUDA_TRY(cudaDeviceSynchronize()); RAFT_CUBLAS_TRY(cublasDestroy(handle)); diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index 2c56a902d4..850ecd72c8 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index eae8fec500..6faedfa137 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index d635c4a813..39b936573f 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 6e4de63e4d..c5858610da 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 51947167cf..cb09b9e7f5 100644 --- 
a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index d63674c13c..fb5e72141b 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 303844b0a4..40c16eed09 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index d28fd55dbe..3b720de505 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 839d60095e..ecd76e5598 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,7 +16,7 @@ #include "../test_utils.h" -#include +#include #include #include From 34b24396c43af190214903427c6d4197dae5d531 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 21 Dec 2021 19:32:36 -0800 Subject: [PATCH 09/17] correcting doxygen build --- cpp/include/raft/linalg/detail/eig.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index a74384e479..c04a939652 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -139,20 +139,6 @@ void eigDC(const raft::handle_t& handle, enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; -/** - * @defgroup eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix 
that has real eig values and - * vectors. - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param n_eig_vals: number of eigenvectors to be generated - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream - * @{ - */ template void eigSelDC(const raft::handle_t& handle, math_t* in, From 897e6f7f7745cd63658607941239e1b71527a69e Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 22 Dec 2021 12:15:02 -0800 Subject: [PATCH 10/17] correcting wrong docs --- cpp/include/raft/linalg/eig.hpp | 18 +++++++++----- cpp/include/raft/linalg/gemv.hpp | 40 +++++++++++++++++++------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 1680e58cbf..3ddf5419b9 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -22,7 +22,12 @@ namespace raft { namespace linalg { /** - * @defgroup eig decomp with divide and conquer method for the column-major + * @defgroup eig Eigen Decomposition Methods + * @{ + */ + +/** + * @brief eig decomp with divide and conquer method for the column-major * symmetric matrices * @param handle raft handle * @param in the input buffer (symmetric matrix that has real eig values and @@ -32,7 +37,6 @@ namespace linalg { * @param eig_vectors: eigenvectors * @param eig_vals: eigen values * @param stream cuda stream - * @{ */ template void eigDC(const raft::handle_t& handle, @@ -51,7 +55,7 @@ using detail::EigVecMemUsage; using detail::OVERWRITE_INPUT; /** - * @defgroup eig decomp with divide and conquer method for the column-major + * @brief eig sel decomp with divide and conquer method for the column-major * symmetric matrices * @param handle raft handle * @param in the input buffer (symmetric matrix that has real eig values and @@ -61,8 +65,8 @@ using detail::OVERWRITE_INPUT; * @param n_eig_vals: number of eigenvectors to be generated * @param 
eig_vectors: eigenvectors * @param eig_vals: eigen values + * @param memUsage: the memory selection for eig vector output * @param stream cuda stream - * @{ */ template void eigSelDC(const raft::handle_t& handle, @@ -79,18 +83,19 @@ void eigSelDC(const raft::handle_t& handle, } /** - * @defgroup overloaded function for eig decomp with Jacobi method for the + * @brief overloaded function for eig decomp with Jacobi method for the * column-major symmetric matrices (in parameter) * @param handle: raft handle + * @param in: input matrix * @param n_rows: number of rows of the input * @param n_cols: number of cols of the input * @param eig_vectors: eigenvectors * @param eig_vals: eigen values + * @param stream: stream on which this function will be run * @param tol: error tolerance for the jacobi method. Algorithm stops when the * error is below tol * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better * accuracy. - * @{ */ template void eigJacobi(const raft::handle_t& handle, @@ -105,6 +110,7 @@ void eigJacobi(const raft::handle_t& handle, { detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); } +/** @} */ // end of eig }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 2a123e8895..dabb1f121a 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -43,12 +43,17 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. 
+ * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -70,12 +75,15 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -94,21 +102,20 @@ void gemv(const raft::handle_t& handle, * y = alpha * op(A) * x + beta * y * * where - * - * @param alpha is a scalar scale of Ax. - * - * @param beta is a scalar scale of y. - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. * set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -130,17 +137,18 @@ void gemv(const raft::handle_t& handle, * y = op(A) * x * * where - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. 
* op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. * set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run * */ template From 3d4b5f1e4d30e44f46f855e6422a1e105d1fbd8f Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 11 Jan 2022 10:40:41 -0800 Subject: [PATCH 11/17] review feedback --- cpp/include/raft/linalg/add.hpp | 2 +- cpp/include/raft/linalg/binary_op.hpp | 2 +- .../raft/linalg/cholesky_r1_update.hpp | 2 +- .../raft/linalg/coalesced_reduction.hpp | 2 +- cpp/include/raft/linalg/contractions.hpp | 2 +- cpp/include/raft/linalg/detail/add.cuh | 2 +- cpp/include/raft/linalg/detail/binary_op.cuh | 2 +- .../raft/linalg/detail/cholesky_r1_update.hpp | 2 +- .../linalg/detail/coalesced_reduction.cuh | 2 +- .../raft/linalg/detail/contractions.cuh | 2 +- .../raft/linalg/detail/cublas_wrappers.hpp | 2 +- .../raft/linalg/detail/cusolver_wrappers.hpp | 2 +- cpp/include/raft/linalg/detail/divide.hpp | 2 +- cpp/include/raft/linalg/detail/eig.hpp | 2 +- cpp/include/raft/linalg/detail/eltwise.hpp | 2 +- cpp/include/raft/linalg/detail/functional.cuh | 2 +- cpp/include/raft/linalg/detail/gemm.hpp | 2 +- cpp/include/raft/linalg/detail/gemv.hpp | 2 +- cpp/include/raft/linalg/detail/init.hpp | 2 +- cpp/include/raft/linalg/detail/lanczos.hpp | 161 ++++++++++--- cpp/include/raft/linalg/detail/map.cuh | 2 +- .../raft/linalg/detail/map_then_reduce.cuh | 2 +- .../raft/linalg/detail/matrix_vector_op.cuh | 2 +- .../raft/linalg/detail/mean_squared_error.hpp | 2 +- 
cpp/include/raft/linalg/detail/multiply.hpp | 2 +- cpp/include/raft/linalg/detail/norm.hpp | 2 +- cpp/include/raft/linalg/detail/qr.cuh | 2 +- cpp/include/raft/linalg/detail/reduce.hpp | 2 +- .../raft/linalg/detail/strided_reduction.cuh | 2 +- cpp/include/raft/linalg/detail/subtract.cuh | 2 +- cpp/include/raft/linalg/detail/svd.hpp | 2 +- cpp/include/raft/linalg/detail/transpose.hpp | 2 +- cpp/include/raft/linalg/detail/unary_op.cuh | 2 +- cpp/include/raft/linalg/divide.hpp | 2 +- cpp/include/raft/linalg/eig.hpp | 2 +- cpp/include/raft/linalg/eltwise.hpp | 2 +- cpp/include/raft/linalg/gemm.hpp | 2 +- cpp/include/raft/linalg/gemv.hpp | 2 +- cpp/include/raft/linalg/init.hpp | 2 +- cpp/include/raft/linalg/lanczos.hpp | 217 ++---------------- cpp/include/raft/linalg/map.hpp | 2 +- cpp/include/raft/linalg/map_then_reduce.hpp | 2 +- cpp/include/raft/linalg/matrix_vector_op.hpp | 2 +- .../raft/linalg/mean_squared_error.hpp | 2 +- cpp/include/raft/linalg/multiply.hpp | 2 +- cpp/include/raft/linalg/norm.hpp | 2 +- cpp/include/raft/linalg/qr.hpp | 2 +- cpp/include/raft/linalg/reduce.hpp | 2 +- cpp/include/raft/linalg/strided_reduction.hpp | 2 +- cpp/include/raft/linalg/subtract.hpp | 2 +- cpp/include/raft/linalg/svd.hpp | 2 +- cpp/include/raft/linalg/transpose.hpp | 2 +- cpp/include/raft/linalg/unary_op.hpp | 2 +- 53 files changed, 201 insertions(+), 279 deletions(-) diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index 08496eef0d..2f999a45d2 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 12afcbcd9a..5c73b6d3c5 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 9dc9630a86..583c65c50e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 00ac7b4be9..0f1ca9202d 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp index ae6832bd7a..e317588b1d 100644 --- a/cpp/include/raft/linalg/contractions.hpp +++ b/cpp/include/raft/linalg/contractions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index 7924f11e90..794a776dcf 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh index 7c9ba2aeed..6b1f8bc6d7 100644 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp index 5f88c36a0c..d070e47b31 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh index bb451bf13a..7e545e4932 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index d5dd416c49..40d0839f60 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 5c8779b0cf..83890f348a 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 2ff6825ea9..171227498d 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp index ad579a31f0..c694529fb5 100644 --- a/cpp/include/raft/linalg/detail/divide.hpp +++ b/cpp/include/raft/linalg/detail/divide.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index c04a939652..a27a65efd8 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp index e60c97e0e6..b15717f205 100644 --- a/cpp/include/raft/linalg/detail/eltwise.hpp +++ b/cpp/include/raft/linalg/detail/eltwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh index 4cebd34d08..067b1565e0 100644 --- a/cpp/include/raft/linalg/detail/functional.cuh +++ b/cpp/include/raft/linalg/detail/functional.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index ca01f20d8b..28f12084f7 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index 246db6e58b..991268cf26 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp index 5aa8d8247e..4718a2cb0e 100644 --- a/cpp/include/raft/linalg/detail/init.hpp +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index c585e23a20..9ee2f6cdc9 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -700,6 +700,55 @@ static int lanczosRestart(handle_t const& handle, namespace detail { +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. 
+ * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed.
+ * @return error flag. + */ template int computeSmallestEigenvectors(handle_t const& handle, sparse_matrix_t const* A, @@ -980,28 +1029,72 @@ int computeSmallestEigenvectors(handle_t const& handle, // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = raft::detail::computeSmallestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; } +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. 
+ * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. 
+ */ template int computeLargestEigenvectors(handle_t const& handle, sparse_matrix_t const* A, @@ -1282,22 +1375,22 @@ int computeLargestEigenvectors(handle_t const& handle, // Perform Lanczos method index_type_t effIter; - int status = raft::detail::computeLargestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 7f1ba3da0d..513432ef27 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 089bc627be..99e04d82e7 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 972bd793ab..e7debb0bee 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp index 2ef9479b87..f0a9daebdb 100644 --- a/cpp/include/raft/linalg/detail/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp index 2cd83920c5..da06c23aed 100644 --- a/cpp/include/raft/linalg/detail/multiply.hpp +++ b/cpp/include/raft/linalg/detail/multiply.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp index 492f34e59d..fcf98c7daf 100644 --- a/cpp/include/raft/linalg/detail/norm.hpp +++ b/cpp/include/raft/linalg/detail/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 2abb61db59..a250dd3578 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp index 181a7d52b1..94c8f5ba52 100644 --- a/cpp/include/raft/linalg/detail/reduce.hpp +++ b/cpp/include/raft/linalg/detail/reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index f9313088f9..a0d1e2abaa 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 26fe258825..767373574b 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/svd.hpp b/cpp/include/raft/linalg/detail/svd.hpp index df61d20274..796adc89ff 100644 --- a/cpp/include/raft/linalg/detail/svd.hpp +++ b/cpp/include/raft/linalg/detail/svd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/transpose.hpp b/cpp/include/raft/linalg/detail/transpose.hpp index 9dda6e5991..659d3a8ef6 100644 --- a/cpp/include/raft/linalg/detail/transpose.hpp +++ b/cpp/include/raft/linalg/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh index d419a9ed7b..9ddfe79657 100644 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index a93ffe64fc..6c8480bf19 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 3ddf5419b9..5c465a3a41 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 930a125be7..5c2a97b57d 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 19f79b2259..04ddbb3561 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index dabb1f121a..e8d378c187 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index 10498363e7..bb577672e8 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 43164b676a..e7d965f810 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,93 +24,6 @@ namespace raft { // Eigensolver // ========================================================= -/** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. 
Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param shift On exit, pointer to matrix shift (estimate for - * largest eigenvalue). - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. 
- */ -template -int computeSmallestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* shift, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - return raft::detail::computeSmallestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - effIter, - totalIter, - shift, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev, - eigVals_dev, - eigVecs_dev, - seed); -} - /** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least @@ -161,101 +74,17 @@ int computeSmallestEigenvectors(handle_t const& handle, value_type_t* __restrict__ eigVecs_dev, unsigned long long seed = 1234567) { - return raft::detail::computeSmallestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); -} - -// ========================================================= -// Eigensolver -// ========================================================= - -/** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. 
- * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. 
- */ -template -int computeLargestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - return raft::detail::computeLargestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - effIter, - totalIter, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev, - eigVals_dev, - eigVecs_dev, - seed); + return detail::computeSmallestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); } /** @@ -308,17 +137,17 @@ int computeLargestEigenvectors(handle_t const& handle, value_type_t* __restrict__ eigVecs_dev, unsigned long long seed = 123456) { - return raft::detail::computeLargestEigenvectors(handle, - A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - iter, - eigVals_dev, - eigVecs_dev, - seed); + return detail::computeLargestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); } } // namespace raft diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index 1c4b6816ae..febeaa8621 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 48c0318798..04275995a0 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index a8a805b4c2..b9790ebce2 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 3a97a4396e..42af8642b6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index edc84f2bcf..4a1628b44a 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index 79b060454b..a6336769ca 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index fb1c6be8be..50e97e4069 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp index ca9ad34dc8..1c4ef70df8 100644 --- a/cpp/include/raft/linalg/reduce.hpp +++ b/cpp/include/raft/linalg/reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index f8c37f07e0..0f97323e5a 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index 716db1a195..9d48948cad 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index c18c73eaed..a30180b174 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index 4c16df331c..50608877fa 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index a7753ccff7..51faa2e4a4 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From b6471d665d62787975e9872feaef3e7e978957a3 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 26 Jan 2022 15:06:22 -0800 Subject: [PATCH 12/17] review changes --- cpp/include/raft/distance/distance_type.hpp | 2 +- .../raft/linalg/detail/cusolver_wrappers.hpp | 30 -- cpp/include/raft/linalg/detail/eig.hpp | 2 - cpp/include/raft/linalg/detail/lanczos.hpp | 403 +++++++++--------- 4 files changed, 203 insertions(+), 234 deletions(-) diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp index 7a15c97f48..f75263b00d 100644 --- a/cpp/include/raft/distance/distance_type.hpp +++ b/cpp/include/raft/distance/distance_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 171227498d..aac58547f8 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -143,7 +143,6 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -158,7 +157,6 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -225,7 +223,6 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, 
devInfo); } @@ -242,7 +239,6 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -324,7 +320,6 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -343,7 +338,6 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -420,7 +414,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } @@ -437,7 +430,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -545,7 +537,6 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -570,7 +561,6 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -632,7 +622,6 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* 
devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -656,7 +645,6 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -756,7 +744,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -780,7 +767,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -845,7 +831,6 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -860,7 +845,6 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } /** @} */ @@ -893,7 +877,6 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -909,7 +892,6 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t 
stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } /** @} */ @@ -941,7 +923,6 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> @@ -956,7 +937,6 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } @@ -1023,7 +1003,6 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> @@ -1040,7 +1019,6 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } @@ -1121,7 +1099,6 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1143,7 +1120,6 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1310,7 +1286,6 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); return cusolverSpScsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, 
csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1332,7 +1307,6 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); return cusolverSpDcsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1371,7 +1345,6 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1401,7 +1374,6 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1451,7 +1423,6 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1487,7 +1458,6 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp index a27a65efd8..8716b4de29 100644 --- a/cpp/include/raft/linalg/detail/eig.hpp +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -16,8 +16,6 @@ #pragma once -#include "cusolver_wrappers.hpp" - #include "cusolver_wrappers.hpp" #include #include diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index 9ee2f6cdc9..c761c06c14 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -123,28 +123,28 @@ int performLanczosIteration(handle_t const& handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, 
- lanczosVecs_dev, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot( + RAFT_CUBLAS_TRY(cublasdot( cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy( + RAFT_CUBLAS_TRY(cublasaxpy( cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -156,113 +156,113 @@ int performLanczosIteration(handle_t const& handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - 
stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(value_type_t), - cudaMemcpyDeviceToHost, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); + RAFT_CUBLAS_TRY(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = 
-alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2( + RAFT_CUBLAS_TRY(cublasnrm2( cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged @@ -270,10 +270,10 @@ int performLanczosIteration(handle_t const& handle, // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } - CUDA_TRY(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); return 0; } @@ -638,59 +638,60 @@ static int lanczosRestart(handle_t const& handle, WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1, - stream)); + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host 
+ iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - iter_new, - iter, - &one, - lanczosVecs_dev, - n, - V_dev, - iter, - &zero, - work_dev, - n, - stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, - work_dev, - n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2( + RAFT_CUBLAS_TRY(cublasnrm2( cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + RAFT_CUBLAS_TRY( + cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -821,7 +822,7 @@ int computeSmallestEigenvectors(handle_t const& handle, work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute 
largest eigenvalue to determine shift @@ -837,10 +838,10 @@ int computeSmallestEigenvectors(handle_t const& handle, // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); value_type_t normQ1; - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos *effIter = 0; @@ -956,35 +957,35 @@ int computeSmallestEigenvectors(handle_t const& handle, work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter), - nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); - - CUDA_TRY(cudaMemcpyAsync(work_dev, - Z_host, - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); @@ -1167,7 +1168,7 @@ int computeLargestEigenvectors(handle_t const& handle, work_host = work_host_v.data(); // Initialize 
cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1181,10 +1182,10 @@ int computeLargestEigenvectors(handle_t const& handle, // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); value_type_t normQ1; - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos *effIter = 0; @@ -1303,37 +1304,37 @@ int computeLargestEigenvectors(handle_t const& handle, // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed - CUDA_TRY(cudaMemcpyAsync(work_dev, - Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - 
&zero, - eigVecs_dev, - n, - stream)); + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); From 5d8c176de6706aecc96d925b653aae68f22edefa Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 26 Jan 2022 16:39:36 -0800 Subject: [PATCH 13/17] more macro renames --- cpp/include/raft/linalg/detail/map.cuh | 2 +- cpp/include/raft/linalg/detail/map_then_reduce.cuh | 2 +- cpp/include/raft/linalg/detail/subtract.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 513432ef27..56f1dd6f19 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -40,7 +40,7 @@ void mapImpl( const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 99e04d82e7..281861b2f9 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -89,7 +89,7 @@ void mapThenReduceImpl(OutType* out, const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel <<>>(out, len, neutral, map, op, in, args...); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end namespace detail diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 767373574b..23d5eded05 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -61,7 +61,7 @@ void subtractDevScalar(math_t* outDev, const IdxType nblks = raft::ceildiv(len, 
(IdxType)TPB); subtract_dev_scalar_kernel <<>>(outDev, inDev, singleScalarDev, len); - CUDA_CHECK(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } }; // end namespace detail From 14cddfc2c1fb4db232969247cd960c44f928abe3 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 2 Feb 2022 13:12:11 -0800 Subject: [PATCH 14/17] adding explict stream set back to cublas and cusolver wrappers --- .../raft/linalg/detail/cublas_wrappers.hpp | 36 +++++++++++++ .../raft/linalg/detail/cusolver_wrappers.hpp | 52 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 83890f348a..9d8d477355 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -143,6 +143,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } @@ -156,6 +157,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } /** @} */ @@ -172,6 +174,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } @@ -179,6 +182,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -196,12 +200,14 @@ template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) { + 
RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } /** @} */ @@ -240,6 +246,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } @@ -258,6 +265,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -291,6 +299,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -307,6 +316,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } /** @} */ @@ -349,6 +359,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } @@ -369,6 +380,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -414,6 +426,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); 
return cublasSgemmBatched(handle, transa, transb, @@ -450,6 +463,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmBatched(handle, transa, transb, @@ -516,6 +530,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemmStridedBatched(handle, transa, transb, @@ -558,6 +573,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmStridedBatched(handle, transa, transb, @@ -604,6 +620,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -617,6 +634,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -645,6 +663,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -661,6 +680,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -701,6 +721,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ 
-720,6 +741,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -762,6 +784,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } @@ -781,6 +804,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -821,6 +845,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -840,6 +865,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -876,6 +902,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -893,6 +920,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } /** @} */ @@ -909,6 +937,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); 
return cublasSnrm2(handle, n, x, incx, result); } @@ -916,6 +945,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ @@ -950,6 +980,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -968,6 +999,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -995,6 +1027,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, float* result, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } @@ -1008,6 +1041,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, double* result, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } /** @} */ @@ -1044,6 +1078,7 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } @@ -1051,6 +1086,7 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index aac58547f8..acfd239174 100644 --- 
a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -143,6 +143,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -157,6 +158,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -178,6 +180,7 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -190,6 +193,7 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -223,6 +227,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } @@ -239,6 +244,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -269,6 +275,7 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT const float* W, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } @@ -283,6 +290,7 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( 
// NOLINT const double* W, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -320,6 +328,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -338,6 +347,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -365,6 +375,7 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT int* lwork, syevjInfo_t params) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } @@ -380,6 +391,7 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT int* lwork, syevjInfo_t params) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -414,6 +426,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } @@ -430,6 +443,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -472,6 +486,7 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT const float* W, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return 
cusolverDnSsyevdx_bufferSize( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } @@ -493,6 +508,7 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT const double* W, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx_bufferSize( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } @@ -537,6 +553,7 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -561,6 +578,7 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -578,8 +596,10 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT int* lwork) { if (std::is_same, float>::value) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd_bufferSize(handle, m, n, lwork); } } @@ -622,6 +642,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -645,6 +666,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -682,6 +704,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT int* lwork, 
gesvdjInfo_t params) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj_bufferSize( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } @@ -702,6 +725,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT int* lwork, gesvdjInfo_t params) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj_bufferSize( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } @@ -744,6 +768,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -767,6 +792,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -794,6 +820,7 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } @@ -806,6 +833,7 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } @@ -831,6 +859,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -845,6 +874,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, 
stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } /** @} */ @@ -877,6 +907,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -892,6 +923,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } /** @} */ @@ -923,6 +955,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> @@ -937,6 +970,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } @@ -957,6 +991,7 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> @@ -968,6 +1003,7 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT int lda, int* Lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -1003,6 +1039,7 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> @@ -1019,6 +1056,7 @@ inline cusolverStatus_t 
cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } @@ -1043,6 +1081,7 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT const float* TAU, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> @@ -1056,6 +1095,7 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT const double* TAU, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -1099,6 +1139,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1120,6 +1161,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1153,6 +1195,7 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT int ldc, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } @@ -1171,6 +1214,7 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT int ldc, int* lwork) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -1209,6 +1253,7 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT size_t* internalDataInBytes, size_t* workspaceInBytes) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); 
return cusolverSpScsrqrBufferInfoBatched(handle, m, n, @@ -1238,6 +1283,7 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT size_t* internalDataInBytes, size_t* workspaceInBytes) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverSpDcsrqrBufferInfoBatched(handle, m, n, @@ -1286,6 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverSpScsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1307,6 +1354,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverSpDcsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1345,6 +1393,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1374,6 +1423,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1423,6 +1473,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1458,6 +1509,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, From a2f670f05d204fbe95ad520f20e770d494373acc Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 2 Feb 2022 14:53:38 -0800 Subject: 
[PATCH 15/17] resolving errors --- .../raft/linalg/detail/cusolver_wrappers.hpp | 26 ++----------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index acfd239174..34ec6cb673 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -180,7 +180,6 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -193,7 +192,6 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -275,7 +273,6 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT const float* W, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } @@ -290,7 +287,6 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT const double* W, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -375,7 +371,6 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT int* lwork, syevjInfo_t params) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } @@ -391,7 +386,6 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT int* lwork, syevjInfo_t params) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -486,7 +480,6 @@ inline cusolverStatus_t 
cusolverDnsyevdx_bufferSize( // NOLINT const float* W, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevdx_bufferSize( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } @@ -508,7 +501,6 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT const double* W, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx_bufferSize( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } @@ -596,10 +588,8 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT int* lwork) { if (std::is_same, float>::value) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd_bufferSize(handle, m, n, lwork); } } @@ -704,7 +694,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT int* lwork, gesvdjInfo_t params) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj_bufferSize( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } @@ -725,7 +714,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT int* lwork, gesvdjInfo_t params) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj_bufferSize( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } @@ -820,7 +808,6 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } @@ -833,7 +820,6 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } @@ -991,7 +977,6 @@ inline cusolverStatus_t 
cusolverDngeqrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> @@ -1003,7 +988,6 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT int lda, int* Lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -1081,7 +1065,6 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT const float* TAU, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> @@ -1095,7 +1078,6 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT const double* TAU, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -1195,7 +1177,6 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT int ldc, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } @@ -1214,7 +1195,6 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT int ldc, int* lwork) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -1253,7 +1233,6 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT size_t* internalDataInBytes, size_t* workspaceInBytes) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverSpScsrqrBufferInfoBatched(handle, m, n, @@ -1283,7 +1262,6 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT size_t* internalDataInBytes, size_t* workspaceInBytes) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return 
cusolverSpDcsrqrBufferInfoBatched(handle, m, n, @@ -1332,7 +1310,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpScsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1354,7 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpDcsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } From 89bf3c1973fc9dd9d38b9de205b365367b7a820e Mon Sep 17 00:00:00 2001 From: divyegala Date: Thu, 3 Feb 2022 19:38:16 -0800 Subject: [PATCH 16/17] adding set stream to cublas set pointer mode --- cpp/include/raft/linalg/detail/cublas_wrappers.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 9d8d477355..552bae6b7e 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -1062,6 +1062,7 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, cudaStream_t stream) { + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } /** @} */ From f94beefb95e962e087a9704f7e49f880d8e7085d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 7 Feb 2022 18:25:31 -0500 Subject: [PATCH 17/17] Fixing a bad merge --- cpp/include/raft/linalg/gemv.hpp | 31 +++++++++++++---------- cpp/include/raft/stats/detail/meanvar.cuh | 2 +- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 3ff919c1f4..45766b8c9a 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -17,6 +17,9 @@ #pragma once #include "detail/gemv.hpp" +#include + +#include namespace raft { namespace linalg { @@ -57,20 +60,20 @@ void gemv(const raft::handle_t& handle, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublas_device_pointer_mode pmode(cublas_h); - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, - m, - n, - alpha, - A, - lda, - x, - incx, - beta, - y, - incy, - stream)); + detail::cublas_device_pointer_mode pmode(cublas_h); + RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h, + trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, + m, + n, + alpha, + A, + lda, + x, + incx, + beta, + y, + incy, + stream)); } template diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh index ed411ef74d..e3f586fea8 100644 --- a/cpp/include/raft/stats/detail/meanvar.cuh +++ b/cpp/include/raft/stats/detail/meanvar.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft::stats::detail {