diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index 8384598805..21d04f3f8d 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -17,7 +17,7 @@ #pragma once #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index 3007198f60..bead5f1f71 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 21031afef1..45850de115 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -30,7 +30,7 @@ #include #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> #include namespace raft { diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index a8deb8df24..4786f584c4 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 6ad939ecd5..80eb6021ef 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -20,7 +20,7 @@ #include #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> #include namespace raft { diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 1874d2e942..3cb0469803 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 08911e0350..bfca731443 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -16,8 +16,8 @@ #pragma once #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 3dad7ea6d7..935cf6677a 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -17,8 +17,8 @@ #pragma once #include +#include <raft/distance/distance_type.hpp> #include -#include <raft/linalg/distance_type.h> #include namespace raft { diff --git a/cpp/include/raft/linalg/distance_type.h b/cpp/include/raft/distance/distance_type.hpp similarity index 97% rename from cpp/include/raft/linalg/distance_type.h rename to cpp/include/raft/distance/distance_type.hpp index 681a83f3f8..f75263b00d 100644 --- a/cpp/include/raft/linalg/distance_type.h +++ b/cpp/include/raft/distance/distance_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
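For downstream code, the `distance_type` relocation above is a pure include-path change: the 97% similarity index indicates that only the copyright header differs, so the `DistanceType` enum itself is untouched. A minimal consumer-side sketch, assuming the enum keeps its `raft::distance` namespace and enumerators such as `L2Expanded`:

```cpp
// Before this patch, callers included the old header:
//   #include <raft/linalg/distance_type.h>
// After this patch, they include the relocated one:
#include <raft/distance/distance_type.hpp>

// The enum already lived in raft::distance, so call sites compile unchanged:
auto metric = raft::distance::DistanceType::L2Expanded;
```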
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 6421ba5344..22e9e78ebe 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -33,9 +33,10 @@ //#include #include "cudart_utils.h" + #include -#include <raft/linalg/cublas_wrappers.h> -#include <raft/linalg/cusolver_wrappers.h> +#include <raft/linalg/detail/cublas_wrappers.hpp> +#include <raft/linalg/detail/cusolver_wrappers.hpp> #include #include #include diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index dce732b06b..fda4c02a1c 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index a3f2411102..9cd5a29951 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include namespace raft { namespace label { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.hpp similarity index 72% rename from cpp/include/raft/linalg/add.cuh rename to cpp/include/raft/linalg/add.hpp index 926cc44197..2f999a45d2 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,13 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" +#include "detail/add.cuh" namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @brief Elementwise scalar add operation on the input buffer * * @@ -39,8 +40,7 @@ namespace linalg { template void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; - unaryOp(out, in, len, op, stream); + detail::addScalar(out, in, scalar, len, stream); } /** * @@ -59,18 +59,7 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s template void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; - binaryOp(out, in1, in2, len, op, stream); -} - -template -__global__ void add_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ - IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } + detail::add(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and @@ -90,11 +79,7 @@ void addDevScalar(math_t* outDev, const math_t* inDev, const math_t* singleScalarDev, IdxType len, cudaStream_t stream) { - // TODO: block dimension has not been tuned - dim3 block(256); - dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp new file mode 100644 index 0000000000..5c73b6d3c5 --- /dev/null +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/binary_op.cuh" + +#include + +namespace raft { +namespace linalg { + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + detail::binaryOp(out, in1, in2, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.hpp similarity index 56% rename from cpp/include/raft/linalg/cholesky_r1_update.cuh rename to cpp/include/raft/linalg/cholesky_r1_update.hpp index 1745b0dcc8..583c65c50e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,7 @@ #pragma once -#include -#include -#include -#include -#include +#include "detail/cholesky_r1_update.hpp" namespace raft { namespace linalg { @@ -132,94 +128,7 @@ void choleskyRank1Update(const raft::handle_t& handle, cudaStream_t stream, math_t eps = -1) { - // The matrix A' is defined as: - // A' = [[A_11, A_12] - // [A_21, A_22]] - // where: - // - A_11 = A, matrix of size (n-1)x(n-1) - // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements - // - A_22 = A_new[n-1] scalar. - // - // Instead of caclulating the Cholelsky decomposition of A' from scratch, - // we just update L with the new row. The new Cholesky decomposition will be - // calculated as: - // L' = [[L_11, 0] - // [L_12, L_22]] - // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and - // L_12 and L_22 are the new quantities that we need to calculate. - - // We need a workspace in device memory to store a scalar. Additionally, in - // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. - const int align = 256; - int offset = - (uplo == CUBLAS_FILL_MODE_LOWER) ?
raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; - if (workspace == nullptr) { - *n_bytes = offset + 1 * sizeof(math_t); - return; - } - math_t* s = reinterpret_cast(((char*)workspace) + offset); - math_t* L_22 = L + (n - 1) * ld + n - 1; - - math_t* A_new = nullptr; - math_t* A_row = nullptr; - if (uplo == CUBLAS_FILL_MODE_UPPER) { - // A_new is stored as the n-1 th column of L - A_new = L + (n - 1) * ld; - } else { - // If the input is lower triangular, then the new elements of A are stored - // as the n-th row of L. Since the matrix is column major, this is non - // contiguous. We copy elements from A_row to a contiguous workspace A_new. - A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); - } - cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; - if (n > 1) { - // Calculate L_12 = x by solving equation L_11 x = A_12 - math_t alpha = 1; - RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(), - CUBLAS_SIDE_LEFT, - uplo, - op, - CUBLAS_DIAG_NON_UNIT, - n - 1, - 1, - &alpha, - L, - ld, - A_new, - n - 1, - stream)); - - // A_new now stores L_12, we calculate s = L_12 * L_12 - RAFT_CUBLAS_TRY( - raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); - - if (uplo == CUBLAS_FILL_MODE_LOWER) { - // Copy back the L_12 elements as the n-th row of L - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); - } - } else { // n == 1 case - RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); - } - - // L_22 = sqrt(A_22 - L_12 * L_12) - math_t s_host; - math_t L_22_host; - raft::update_host(&s_host, s, 1, stream); - raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - L_22_host = std::sqrt(L_22_host - s_host); - - // Check for numeric error with sqrt. If the matrix is not positive definit or - // the system is very ill conditioned then the A_22 - L_12 * L_12 can be - // negative, which would result L_22 = NaN. A small positive eps parameter - // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } - ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); - raft::update_device(L_22, &L_22_host, 1, stream); + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); } }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp new file mode 100644 index 0000000000..0f1ca9202d --- /dev/null +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/coalesced_reduction.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied during accumulation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*ReduceLambda)(OutType, OutType);</pre>
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*FinalLambda)(OutType);</pre>
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension of data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added in place or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp new file mode 100644 index 0000000000..e317588b1d --- /dev/null +++ b/cpp/include/raft/linalg/contractions.hpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/contractions.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief This is the central policy that should be used to configure the perf + * landscape of the Contraction kernel. + * + * The main goal of this Policy struct is to provide sufficient knobs to tune the + * perf of the Contraction kernel, as and when we see matrices of different shapes. + * + * @tparam DataT the IO and math datatype + * @tparam _veclen number of k-elements loaded by each thread for every LDG call + * it makes. This should be configured based on the input 'k' + * value and the input data type. For eg: if DataT = float and + * k is a multiple of 4, then setting this to 4 gives the best + * LDG pattern. Possible values are {1, 2, 4}. + * @tparam _kblk number of k-elements operated upon per main-loop iteration. + * Therefore total number of main-loop iterations will be + * `ceil(k/_kblk)`. This must be a multiple of `_veclen`. Do note + * that the bigger this value, the greater the shared mem requirement. + * @tparam _rpt Defines the number of rows that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of m-elements worked upon + * by each thread block. + * @tparam _cpt Defines the number of cols that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of n-elements worked upon + * by each thread block. + * @tparam _tr Number of threads working on the same output column. This is + * used to compute the number of m-elements worked upon by each + * thread block.
This also determines the number of threads per + * thread block + * @tparam _tc Number of threads working on the same output row. This is + * used to compute the number of m-elements worked upon by each + * thread block. This also determines the number of threads per + * thread block + */ +template +struct KernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single row */ + LdgThRow = Kblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Mblk * LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Nblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Mblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Nblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Kblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Mblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Nblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // enum + +}; // struct KernelPolicy + +template +struct ColKernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single col */ + LdgThRow = Mblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Kblk * LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Kblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Kblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Kblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Mblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Kblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Kblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // colMajor enum + static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); +}; +/** + * @defgroup Policy4x4 16 elements per thread Policy with k-block 
= 32 + * @{ + */ +template +struct Policy4x4 { +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 + * @{ + */ +template +struct Policy2x8 { +}; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy2x8 { + // this is not used, just to keep the compiler happy. + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @brief Base class for gemm-like NT contractions + * + * This class does not provide any arithmetic operations, but only provides the + * memory-related operations of loading the `x` and `y` matrix blocks from the + * global memory into shared memory and then from shared into registers. Thus, + * this class acts as a basic building block for further composing gemm-like NT + * contractions on input matrices which are row-major (as is the output) + * + * @tparam DataT IO and math data type + * @tparam IdxT indexing type + * @tparam Policy policy used to customize memory access behavior. + * See documentation for `KernelPolicy` to know more. + */ +using detail::Contractions_NT; + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh new file mode 100644 index 0000000000..794a776dcf --- /dev/null +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include "functional.cuh" + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + +template +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } +} + +template +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + // TODO: block dimension has not been tuned + dim3 block(256); + dim3 grid(raft::ceildiv(len, (IdxType)block.x)); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh similarity index 81% rename from cpp/include/raft/linalg/binary_op.cuh rename to cpp/include/raft/linalg/detail/binary_op.cuh index 00a2af0014..6b1f8bc6d7 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,11 @@ #pragma once -#include #include namespace raft { namespace linalg { +namespace detail { template __global__ void binaryOpKernel( @@ -60,22 +60,6 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6 return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } -/** - * @brief perform element-wise binary operation on the input arrays - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val1, const InType& val2);` - */ template +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ + // The matrix A' is defined as: + // A' = [[A_11, A_12] + // [A_21, A_22]] + // where: + // - A_11 = A, matrix of size (n-1)x(n-1) + // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements + // - A_22 = A_new[n-1] scalar. + // + // Instead of calculating the Cholesky decomposition of A' from scratch, + // we just update L with the new row.
The new Cholesky decomposition will be + // calculated as: + // L' = [[L_11, 0] + // [L_12, L_22]] + // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and + // L_12 and L_22 are the new quantities that we need to calculate. + + // We need a workspace in device memory to store a scalar. Additionally, in + // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. + const int align = 256; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; + if (workspace == nullptr) { + *n_bytes = offset + 1 * sizeof(math_t); + return; + } + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; + + math_t* A_new = nullptr; + math_t* A_row = nullptr; + if (uplo == CUBLAS_FILL_MODE_UPPER) { + // A_new is stored as the n-1 th column of L + A_new = L + (n - 1) * ld; + } else { + // If the input is lower triangular, then the new elements of A are stored + // as the n-th row of L. Since the matrix is column major, this is non + // contiguous. We copy elements from A_row to a contiguous workspace A_new. + A_row = L + n - 1; + A_new = reinterpret_cast(workspace); + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); + } + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + if (n > 1) { + // Calculate L_12 = x by solving equation L_11 x = A_12 + math_t alpha = 1; + RAFT_CUBLAS_TRY(cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); + + // A_new now stores L_12, we calculate s = L_12 * L_12 + RAFT_CUBLAS_TRY(cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); + + if (uplo == CUBLAS_FILL_MODE_LOWER) { + // Copy back the L_12 elements as the n-th row of L + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); + } + } else { // n == 1 case + RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); + } + + // L_22 = sqrt(A_22 - L_12 * L_12) + math_t s_host; + math_t L_22_host; + raft::update_host(&s_host, s, 1, stream); + raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + L_22_host = std::sqrt(L_22_host - s_host); + + // Check for numeric error with sqrt. If the matrix is not positive definite or + // the system is very ill conditioned then A_22 - L_12 * L_12 can be + // negative, which would result in L_22 = NaN. A small positive eps parameter + // can be used to prevent this. + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } + ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); + raft::update_device(L_22, &L_22_host, 1, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh similarity index 71% rename from cpp/include/raft/linalg/coalesced_reduction.cuh rename to cpp/include/raft/linalg/detail/coalesced_reduction.cuh index 717e2c42b2..7e545e4932 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION.
 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ namespace raft { namespace linalg { +namespace detail { // Kernel (based on norm.cuh) to perform reductions along the coalesced dimension // of the matrix, i.e. reduce along rows for row major or reduce along columns @@ -61,33 +62,6 @@ __global__ void coalescedReductionKernel(OutType* dots, } } -/** - * @brief Compute reduction of the input matrix along the leading dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template -struct KernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single row */ - LdgThRow = Kblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Mblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Nblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Mblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Nblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Kblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Mblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Nblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // enum - -}; // struct KernelPolicy - -template -struct ColKernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single col */ - LdgThRow = Mblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Kblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Kblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Kblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Kblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Mblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Kblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Kblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** 
size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // colMajor enum - static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); -}; -/** - * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32 - * @{ - */ -template -struct Policy4x4 { -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 - * @{ - */ -template -struct Policy2x8 { -}; - -template -struct Policy2x8 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy2x8 { - // this is not used just for keeping compiler happy. - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @brief Base class for gemm-like NT contractions - * - * This class does not provide any arithmetic operations, but only provides the - * memory-related operations of loading the `x` and `y` matrix blocks from the - * global memory into shared memory and then from shared into registers. Thus, - * this class acts as a basic building block for further composing gemm-like NT - * contractions on input matrices which are row-major (and so does the output) - * - * @tparam DataT IO and math data type - * @tparam IdxT indexing type - * @tparam Policy policy used to customize memory access behavior. - * See documentation for `KernelPolicy` to know more. - */ template struct Contractions_NT { protected: @@ -492,5 +313,6 @@ struct Contractions_NT { }; // struct Contractions_NT +} // namespace detail } // namespace linalg -} // namespace raft +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp similarity index 94% rename from cpp/include/raft/linalg/cublas_wrappers.h rename to cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 246e6466d8..752235d246 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
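The wrapper hunks that follow do two things: they replace the older `CUBLAS_CHECK` macro with `RAFT_CUBLAS_TRY` around every `cublasSetStream` call, and they move the overloads into `raft::linalg::detail`. A hedged caller-side sketch of the resulting pattern; the float `cublasaxpy` overload and `handle.get_cublas_handle()` both appear in this diff, while the exact argument order shown here follows standard BLAS `axpy` and is an assumption:

```cpp
#include <raft/handle.hpp>
#include <raft/linalg/detail/cublas_wrappers.hpp>

// Computes y = alpha * x + y on the handle's cuBLAS handle.
void axpy_f32(const raft::handle_t& handle, int n, const float* x, float* y, cudaStream_t stream)
{
  const float alpha = 1.0f;
  // The detail wrapper binds the stream to the cuBLAS handle and forwards to
  // cublasSaxpy; RAFT_CUBLAS_TRY raises on any non-success status.
  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
    handle.get_cublas_handle(), n, &alpha, x, 1, y, 1, stream));
}
```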
@@ -117,6 +117,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * Assuming the default CUBLAS_POINTER_MODE_HOST, change it to host or device mode @@ -171,7 +172,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } @@ -185,7 +186,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } /** @} */ @@ -202,7 +203,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } @@ -210,7 +211,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -228,14 +229,14 @@ template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } /** @} */ @@ -274,7 +275,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } @@ -293,7 +294,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -327,7 +328,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -344,7 +345,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } /** @} */ @@ -387,7 +388,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } @@ -408,7 +409,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, 
int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -454,7 +455,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemmBatched(handle, transa, transb, @@ -491,7 +492,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmBatched(handle, transa, transb, @@ -558,7 +559,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemmStridedBatched(handle, transa, transb, @@ -601,7 +602,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmStridedBatched(handle, transa, transb, @@ -648,7 +649,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -662,7 +663,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -691,7 +692,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -708,7 +709,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -749,7 +750,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -769,7 +770,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -812,7 +813,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } @@ -832,7 +833,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int 
ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -873,7 +874,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -893,7 +894,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -930,7 +931,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -948,7 +949,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } /** @} */ @@ -965,7 +966,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } @@ -973,7 +974,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ @@ -1008,7 +1009,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -1027,7 +1028,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -1055,7 +1056,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } @@ -1069,7 +1070,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } /** @} */ @@ -1090,7 +1091,7 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } /** @} */ @@ -1107,7 +1108,7 @@ template <> inline cublasStatus_t cublasscal( 
cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } @@ -1115,11 +1116,12 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } /** @} */ +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp similarity index 95% rename from cpp/include/raft/linalg/cusolver_wrappers.h rename to cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 988e7512d5..34ec6cb673 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,6 +115,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * @defgroup Getrf cusolver getrf operations @@ -142,7 +143,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -157,7 +158,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -224,7 +225,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } @@ -241,7 +242,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -323,7 +324,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -342,7 +343,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -419,7 +420,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { 
- CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } @@ -436,12 +437,11 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ -#if CUDART_VERSION >= 10010 /** * @defgroup syevdx cusolver syevdx operations * @{ @@ -545,7 +545,7 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -570,12 +570,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ -#endif /** * @defgroup svd cusolver svd operations @@ -633,7 +632,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -657,7 +656,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -757,7 +756,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -781,7 +780,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -846,7 +845,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -861,7 +860,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } /** @} */ @@ -894,7 +893,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - 
CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -910,7 +909,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } /** @} */ @@ -942,7 +941,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> @@ -957,7 +956,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } @@ -1024,7 +1023,7 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> @@ -1041,7 +1040,7 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } @@ -1122,7 +1121,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1144,7 +1143,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1311,7 +1310,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpScsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1333,7 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpDcsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1372,7 +1371,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1402,7 +1401,7 @@ inline cusolverStatus_t 
cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1452,7 +1451,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1488,7 +1487,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1509,5 +1508,6 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT /** @} */ #endif +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp new file mode 100644 index 0000000000..c694529fb5 --- /dev/null +++ b/cpp/include/raft/linalg/detail/divide.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "functional.cuh" +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, divides_scalar(scalar), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/detail/eig.hpp similarity index 86% rename from cpp/include/raft/linalg/eig.cuh rename to cpp/include/raft/linalg/detail/eig.hpp index 200f69a88a..8716b4de29 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ #pragma once +#include "cusolver_wrappers.hpp" #include #include #include #include -#include #include #include #include namespace raft { namespace linalg { +namespace detail { template void eigDC_legacy(const raft::handle_t& handle, @@ -73,19 +74,6 @@ void eigDC_legacy(const raft::handle_t& handle, "This usually occurs when some of the features do not vary enough."); } -/** - * @defgroup eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. 
- * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream - * @{ - */ template void eigDC(const raft::handle_t& handle, const math_t* in, @@ -149,22 +137,6 @@ void eigDC(const raft::handle_t& handle, enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; -#if CUDART_VERSION >= 10010 - -/** - * @defgroup eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param n_eig_vals: number of eigenvectors to be generated - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream - * @{ - */ template void eigSelDC(const raft::handle_t& handle, math_t* in, @@ -256,39 +228,23 @@ void eigSelDC(const raft::handle_t& handle, } } -#endif - -/** - * @defgroup overloaded function for eig decomp with Jacobi method for the - * column-major symmetric matrices (in parameter) - * @param handle: raft handle - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. - * @{ - */ template void eigJacobi(const raft::handle_t& handle, const math_t* in, - std::size_t n_rows, - std::size_t n_cols, + int n_rows, + int n_cols, math_t* eig_vectors, math_t* eig_vals, cudaStream_t stream, - math_t tol = 1.e-7, - std::uint32_t sweeps = 15) + math_t tol = 1.e-7, + int sweeps = 15) { cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params)); RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH, @@ -326,5 +282,6 @@ void eigJacobi(const raft::handle_t& handle, RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); } -}; // end namespace linalg -}; // end namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp new file mode 100644 index 0000000000..b15717f205 --- /dev/null +++ b/cpp/include/raft/linalg/detail/eltwise.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
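The eigensolver entry points above now live under detail/, with eigJacobi taking plain int dimensions and an int sweep count. A minimal caller sketch follows; the include path mirrors the rename in this diff, while the buffer names and the float instantiation are illustrative assumptions, not part of the patch:

```cpp
// Hedged sketch: exercises detail::eigJacobi as declared in the diff above.
// The handle and device buffers (d_A, d_eig_vectors, d_eig_vals) are assumed
// to be allocated and populated by the caller.
#include <raft/handle.hpp>
#include <raft/linalg/detail/eig.hpp>  // path introduced by this diff

void jacobi_eig_sketch(const raft::handle_t& handle,
                       const float* d_A,  // n x n symmetric, column-major
                       int n,
                       float* d_eig_vectors,  // n x n eigenvector output
                       float* d_eig_vals,     // n eigenvalue output
                       cudaStream_t stream)
{
  // tol / sweeps mirror the new defaults: the Jacobi solver stops once the
  // off-diagonal error falls below tol, or after `sweeps` full sweeps.
  raft::linalg::detail::eigJacobi(
    handle, d_A, n, n, d_eig_vectors, d_eig_vals, stream, 1.e-7f, 15);
}
```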
+ */ + +#pragma once + +#include "functional.cuh" + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, multiplies_scalar(scalar), stream); +} + +template +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + +template +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::minus(), stream); +} + +template +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::multiplies(), stream); +} + +template +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::divides(), stream); +} + +template +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, divides_check_zero(), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh new file mode 100644 index 0000000000..067b1565e0 --- /dev/null +++ b/cpp/include/raft/linalg/detail/functional.cuh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +struct divides_scalar { + public: + divides_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in / scalar_; } + + private: + ArgType scalar_; +}; + +template +struct adds_scalar { + public: + adds_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in + scalar_; } + + private: + ArgType scalar_; +}; + +template +struct multiplies_scalar { + public: + multiplies_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in * scalar_; } + + private: + ArgType scalar_; +}; + +template +struct divides_check_zero { + public: + __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) + { + return (b == static_cast(0)) ? 
0.0 : a / b; + } +}; + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/detail/gemm.hpp similarity index 88% rename from cpp/include/raft/linalg/gemm.cuh rename to cpp/include/raft/linalg/detail/gemm.hpp index b5147915ba..8a02b702e5 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once +#include "cublas_wrappers.hpp" #include #include #include -#include namespace raft { namespace linalg { +namespace detail { /** * @brief the wrapper of cublas gemm function @@ -146,25 +147,6 @@ void gemm(const raft::handle_t& handle, handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } -/** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible - * combinations of operand layouts. - * It computes the following equation: Z = alpha . X * Y + beta . Z - * @tparam T Data type of input/output matrices (float/double) - * @param handle raft handle - * @param z output matrix of size M rows x N columns - * @param x input matrix of size M rows x K columns - * @param y input matrix of size K rows x N columns - * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z - * @param _K number of columns of X and rows of Y - * @param isZColMajor Storage layout of Z. true = col major, false = row major - * @param isXColMajor Storage layout of X. true = col major, false = row major - * @param isYColMajor Storage layout of Y. true = col major, false = row major - * @param stream cuda stream - * @param alpha scalar - * @param beta scalar - */ template void gemm(const raft::handle_t& handle, T* z, @@ -253,5 +235,6 @@ void gemm(const raft::handle_t& handle, cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } -} // end namespace linalg -} // end namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp new file mode 100644 index 0000000000..991268cf26 --- /dev/null +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
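The functors in detail/functional.cuh are plain `__host__ __device__` callables, and the scalar/eltwise routines above are one-line compositions of such a functor with the unaryOp/binaryOp primitives. The self-contained CUDA sketch below reproduces that pattern with stand-in names; it is an illustration of the design, not the RAFT headers themselves:

```cpp
// Standalone sketch of the functor + unaryOp composition used by the detail
// headers above. Compile with nvcc; names mirror the diff but are local.
#include <cuda_runtime.h>
#include <cstdio>

template <typename ReturnType, typename ArgType>
struct adds_scalar {
  adds_scalar(ArgType scalar) : scalar_(scalar) {}
  __host__ __device__ ReturnType operator()(ArgType in) const { return in + scalar_; }
 private:
  ArgType scalar_;
};

// A minimal unaryOp: one thread per element, functor applied elementwise.
template <typename InT, typename OutT, typename Lambda>
__global__ void unary_op_kernel(OutT* out, const InT* in, int len, Lambda op)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) { out[i] = op(in[i]); }
}

int main()
{
  const int len = 8;
  float h_in[len] = {0, 1, 2, 3, 4, 5, 6, 7}, h_out[len];
  float *d_in, *d_out;
  cudaMalloc(&d_in, len * sizeof(float));
  cudaMalloc(&d_out, len * sizeof(float));
  cudaMemcpy(d_in, h_in, len * sizeof(float), cudaMemcpyHostToDevice);

  // scalarAdd(out, in, 10.f, len, stream) reduces to exactly this launch.
  unary_op_kernel<<<1, 32>>>(d_out, d_in, len, adds_scalar<float, float>(10.f));
  cudaMemcpy(h_out, d_out, len * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < len; ++i) printf("%g ", h_out[i]);  // 10 11 ... 17
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```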
+ */ + +#pragma once + +#include + +#include "cublas_wrappers.hpp" + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); +} + +}; // namespace detail +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/detail/init.hpp similarity index 81% rename from cpp/include/raft/linalg/init.h rename to cpp/include/raft/linalg/detail/init.hpp index 03d4f99e90..4718a2cb0e 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,19 +23,8 @@ namespace raft { namespace linalg { +namespace detail { -namespace { - -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [end-start] - * \param [in] start of the range - * \param [in] end of range (exclusive) - * \param [in] stream cuda stream - */ template void range(T* out, int start, int end, cudaStream_t stream) { @@ -59,6 +48,7 @@ void range(T* out, int n, cudaStream_t stream) { range(out, 0, n, stream); } -} // unnamed namespace + +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp new file mode 100644 index 0000000000..c761c06c14 --- /dev/null +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -0,0 +1,1401 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// for cmath: +#define _USE_MATH_DEFINES + +#include +#include + +#include +#include + +#include "cublas_wrappers.hpp" +#include +#include +#include +#include +#include + +namespace raft { + +using namespace matrix; +using namespace linalg::detail; + +namespace spectral { +namespace detail { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. 
Otherwise non-zero. + */ +template +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; + value_type_t alpha; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); + + index_type_t n = A->nrows_; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + RAFT_CUBLAS_TRY(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + + alpha = -alpha_host[0]; + RAFT_CUBLAS_TRY(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + alpha = 1 / beta_host[0]; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Orthogonalization with 3-term recurrence relation + else { + RAFT_CUBLAS_TRY(cublasdot(cublas_h, + n, + 
lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); + + auto alpha = -alpha_host[*iter - 1]; + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + alpha = -beta_host[*iter - 2]; + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Compute residual + RAFT_CUBLAS_TRY(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + + // Normalize Lanczos vector + alpha = 1 / beta_host[*iter - 1]; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + return 0; +} + +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + index_type_t i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; +} + +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
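findHouseholder3 above picks the reflection sign so that `v[0] -= *Pv` never suffers catastrophic cancellation. The property it relies on, that P = I - 2*v*v' maps the input onto ±||v||·e_1, can be checked on a concrete vector with a few lines of host code; this is a standalone re-derivation of the same steps, not a call into the diff's code:

```cpp
// Host-only sanity check of the Householder identity: after the transform,
// P applied to the original vector should equal (Pv, 0, 0).
#include <cmath>
#include <cstdio>

int main()
{
  double v[3]    = {3.0, 4.0, 12.0};  // arbitrary input vector, ||v|| = 13
  double orig[3] = {v[0], v[1], v[2]};

  double Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
  if (v[0] >= 0) Pv = -Pv;  // reflect toward -e1 to avoid cancellation
  v[0] -= Pv;
  double nrm = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
  for (int i = 0; i < 3; ++i) v[i] /= nrm;  // normalized Householder vector

  // Apply P = I - 2*v*v' to the original vector.
  double dot = v[0] * orig[0] + v[1] * orig[1] + v[2] * orig[2];
  for (int i = 0; i < 3; ++i) orig[i] -= 2.0 * v[i] * dot;

  // Prints P*v = (-13, 0, 0), matching Pv = -13.
  printf("P*v = (%g, %g, %g), expected (%g, 0, 0)\n", orig[0], orig[1], orig[2], Pv);
  return 0;
}
```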
+ */ +template +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ + // Loop indices + index_type_t i, j; + // Dot product between Householder vector and matrix row/column + value_type_t vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + value_type_t bulge[16]; + + // Householder vector + value_type_t householder[3]; + // Householder matrix + value_type_t householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; + + // Loop indices + index_type_t i, j, pos; + // Temporary variable + value_type_t temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(value_type_t)); + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + 
for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer (host memory) to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer (host memory) to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. 
+ * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. + */ +template +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Loop index + index_type_t i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + index_type_t restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + value_type_t* ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + value_type_t* shifts_host; + + // Orthonormal matrix for similarity transform + value_type_t* V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout <<std::endl; + + // Initialize similarity transform with identity matrix + memset(V_host, 0, iter * iter * sizeof(value_type_t)); + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; + + // Determine interval to suppress eigenvalues + if (smallest_eig) { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + RAFT_CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, +
CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); + + // Obtain new Lanczos vectors + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + // Normalize residual to obtain new Lanczos vector + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + RAFT_CUBLAS_TRY(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + RAFT_CUBLAS_TRY( + cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + + return 0; +} + +} // namespace detail +} // namespace spectral + +namespace detail { + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is the negative of the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix.
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + using namespace raft::spectral::detail; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Shift for implicit restart + value_type_t shiftUpper; + value_type_t shiftLower; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue to determine shift + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + + curandSetPseudoRandomGeneratorSeed(randGen, seed); + + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter - 1]; + + // 
------------------------------------------------------- + // Compute eigenvectors of shifted matrix + // ------------------------------------------------------- + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +template +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + using namespace raft::spectral::detail; + + // Matrix dimension + index_type_t n = A.nrows_; + + // Check 
that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + value_type_t shift; + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are most + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag.
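Both driver routines lean on the spectral-shift idea spelled out in these comments: estimate the extremal eigenvalue first, then run the implicitly restarted iteration on A + s*I so the wanted end of the spectrum becomes extremal, and undo the shift when reporting results. A toy illustration of that re-ordering, with made-up numbers and no Lanczos involved:

```cpp
// Toy illustration of the shift used above: with s = -lambda_max, the
// smallest eigenvalues of A become the most negative (extremal) ones of
// A + s*I, which restarted Lanczos resolves fastest.
#include <cstdio>

int main()
{
  double eigs[3] = {1.0, 2.0, 10.0};  // spectrum of a toy SPD matrix
  double s       = -eigs[2];          // s = -lambda_max, as in the code above
  for (int i = 0; i < 3; ++i)
    printf("lambda = %4g  ->  lambda + s = %4g\n", eigs[i], eigs[i] + s);
  // Prints 1 -> -9, 2 -> -8, 10 -> 0: the wanted small eigenvalues are now
  // extremal. The drivers recover the true values afterwards by subtracting
  // s from the computed Ritz values.
  return 0;
}
```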
+ */ +template +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + using namespace raft::spectral::detail; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + value_type_t shift_val = 0.0; + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * 
shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues, + // but effIter can be larger than nEigVecs, + // hence we add an offset for that case, because we want to access the top nEigVecs eigenpairs in the + // matrix of size effIter.
Remember that the array is sorted, so the offset is not needed in the smallest- + // eigenvalues case, where the first entries are already the smallest ones. + + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; + + // Debug : print nEigVecs largest eigenvalues + // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + // std::cout <<std::endl; + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < top_eigenparis_idx_offset; ++i) + work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory, skipping the smallest eigenvalues if needed + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + // skip the smallest eigenvectors if needed + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +template +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh new file mode 100644 index 0000000000..56f1dd6f19 --- /dev/null +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } +} + +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args...
args) +{ + const int nblks = raft::ceildiv(len, (size_t)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh similarity index 54% rename from cpp/include/raft/linalg/map_then_reduce.cuh rename to cpp/include/raft/linalg/detail/map_then_reduce.cuh index 2fa3ae72de..281861b2f9 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ namespace raft { namespace linalg { +namespace detail { struct sum_tag { }; @@ -91,67 +92,6 @@ void mapThenReduceImpl(OutType* out, RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** - * @brief CUDA version of map and then sum reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output sum-reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ - -template -void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ - mapThenReduceImpl( - out, len, (OutType)0, map, sum_tag(), stream, in, args...); -} - -/** - * @brief CUDA version of map and then generic reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual map operation - * @tparam ReduceLambda the device-lambda performing the actual reduction - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param neutral The neutral element of the reduction operation. For example: - * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max - * @param map the device-lambda - * @param op the reduction device lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ - -template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ - mapThenReduceImpl( - out, len, neutral, map, op, stream, in, args...); -} +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh new file mode 100644 index 0000000000..94545e59f6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh new file mode 100644 index 0000000000..94545e59f6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +namespace { +template <size_t VecBytes> +struct AlignedAccess { + template <typename T> + static inline bool test(const T* matrix, size_t strideBytes) + { + return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) && + Pow2<sizeof(T)>::isAligned(VecBytes); + } +}; +}; // namespace + +template <typename Type, int veclen_, typename Lambda, typename IdxType> +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ + typedef TxN_t<Type, veclen_> VecType; + IdxType len = N * D; + IdxType idx = threadIdx.x; + idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; + idx *= VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec; + ///@todo: yikes! use fast-int-div here. + ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec.load(vector, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec.load(vector, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec.fill(vector[vIdx]); + } else { + vIdx = idx / N; + vec.fill(vector[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); + mat.store(out, idx); +} + +template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB> +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); + matrixVectorOpKernel<Type, veclen_, Lambda, IdxType> + <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256> +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); +} + +template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256> +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
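A minimal usage sketch of the detail-level matrixVectorOp defined above (illustrative; out, matrix, and vec are assumed device float pointers, with matrix holding N rows of D columns in row-major order):

  // Divide every row of the matrix by the length-D vector.
  raft::linalg::detail::matrixVectorOp(
    out, matrix, vec, D, N,
    true /* rowMajor */, true /* bcastAlongRows */,
    [] __device__(float m, float v) { return m / v; }, stream);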
diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp new file mode 100644 index 0000000000..f0a9daebdb --- /dev/null +++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename math_t> +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ + auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { + math_t diff = a - b; + return diff * diff * weight / len; + }; + raft::linalg::mapThenSumReduce(out, len, sq_diff, stream, A, B); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp new file mode 100644 index 0000000000..da06c23aed --- /dev/null +++ b/cpp/include/raft/linalg/detail/multiply.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename math_t, typename IdxType> +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
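Note that meanSquaredError folds the weight into every term as weight / len, so a weight of 1 yields the plain mean squared error. A minimal usage sketch for the two helpers above (illustrative; all pointers are assumed device float buffers, mse_out a single device float):

  // Mean squared error between two length-len buffers A and B.
  raft::linalg::detail::meanSquaredError(mse_out, A, B, len, 1.0f, stream);
  // Element-wise scaling: out[i] = 2 * in[i].
  raft::linalg::detail::multiplyScalar(out, in, 2.0f, len, stream);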
diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp new file mode 100644 index 0000000000..fcf98c7daf --- /dev/null +++ b/cpp/include/raft/linalg/detail/norm.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +/** different types of norms supported on the input buffers */ +enum NormType { L1Norm = 0, L2Norm }; + +template <typename Type, typename IdxType, typename Lambda> +void rowNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +template <typename Type, typename IdxType, typename Lambda> +void colNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
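A minimal usage sketch of the norm callers above (illustrative; dots and data are assumed device float pointers, data holding N rows of D columns in row-major order):

  // Row-wise L2 norms: accumulate squares, then take the square root per row.
  raft::linalg::detail::rowNormCaller(
    dots, data, D, N, raft::linalg::detail::L2Norm, true /* rowMajor */, stream,
    [] __device__(float sq) { return sqrtf(sq); });

diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh similarity index 77% rename from cpp/include/raft/linalg/qr.cuh rename to cpp/include/raft/linalg/detail/qr.cuh index 2870d6d072..a250dd3578 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,30 +16,16 @@ #pragma once -#include -#include +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" #include #include #include namespace raft { namespace linalg { +namespace detail { -/** - * @defgroup QRdecomp QR decomposition - * @{ - */ - -/** - * @brief compute QR decomp and return only Q matrix - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - * @{ - */ template <typename math_t> void qrGetQ(const raft::handle_t& handle, const math_t* M, math_t* Q, @@ -64,26 +50,13 @@ void qrGetQ(const raft::handle_t& handle, rmm::device_uvector<math_t> workspace(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf( cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); - /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.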
-#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr( cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } -/** - * @brief compute QR decomp and return both Q and R matrices - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param R: R matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - */ template void qrGetQR(const raft::handle_t& handle, math_t* M, @@ -119,10 +92,6 @@ void qrGetQR(const raft::handle_t& handle, Lwork, devInfo.data(), stream)); - // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. -#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); @@ -145,7 +114,7 @@ void qrGetQR(const raft::handle_t& handle, devInfo.data(), stream)); } -/** @} */ +}; // namespace detail }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp new file mode 100644 index 0000000000..94c8f5ba52 --- /dev/null +++ b/cpp/include/raft/linalg/detail/reduce.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename InType, + typename OutType = InType, + typename IdxType = int, + typename MainLambda = raft::Nop<InType, IdxType>, + typename ReduceLambda = raft::Sum<OutType>, + typename FinalLambda = raft::Nop<OutType>> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop<InType, IdxType>(), + ReduceLambda reduce_op = raft::Sum<OutType>(), + FinalLambda final_op = raft::Nop<OutType>()) +{ + if (rowMajor && alongRows) { + raft::linalg::coalescedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (rowMajor && !alongRows) { + raft::linalg::stridedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (!rowMajor && alongRows) { + raft::linalg::stridedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } else { + raft::linalg::coalescedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
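The reduce wrapper above selects the coalesced or strided kernel from the (rowMajor, alongRows) pair. A minimal usage sketch (illustrative; dots and data are assumed device float pointers, data holding N rows of D columns in row-major order, and raft::L1Op as used by the norm callers earlier in this diff):

  // Row-wise L1 norms into dots (N entries).
  raft::linalg::detail::reduce(dots, data, D, N, 0.0f,
                               true /* rowMajor */, true /* alongRows */,
                               stream, false /* inplace */,
                               raft::L1Op<float, int>());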
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh similarity index 80% rename from cpp/include/raft/linalg/strided_reduction.cuh rename to cpp/include/raft/linalg/detail/strided_reduction.cuh index 0434f87151..a0d1e2abaa 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,12 @@ #include "unary_op.cuh" #include #include +#include #include namespace raft { namespace linalg { +namespace detail { // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows @@ -102,33 +104,6 @@ __global__ void stridedReductionKernel(OutType* dots, raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); } -/** - * @brief Compute reduction of the input matrix along the strided dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*MainLambda)(InType, IdxType); - * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*ReduceLambda)(OutType); - * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename InT, typename OutT = InT, typename IdxType = int> +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; + raft::linalg::unaryOp(out, in, len, op, stream); +} + +template <typename InT, typename OutT = InT, typename IdxType = int> +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; + raft::linalg::binaryOp(out, in1, in2, len, op, stream); +} + +template <typename math_t, typename IdxType> +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: the kernel does not use shared memory in the current implementation + int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } +} + +template <typename math_t, typename IdxType = int, int TPB = 256> +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + // Note: there is no efficient way to express this operation with cuBLAS; + // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda + const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); + subtract_dev_scalar_kernel<math_t> + <<<nblks, TPB, 0, stream>>>(outDev, inDev, singleScalarDev, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/detail/svd.hpp similarity index 59% rename from cpp/include/raft/linalg/svd.cuh rename to cpp/include/raft/linalg/detail/svd.hpp index 2afae788a1..796adc89ff 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,15 +16,16 @@ #pragma once -#include "eig.cuh" -#include "gemm.cuh" -#include "transpose.h" +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" +#include +#include +#include + #include #include #include #include -#include -#include #include #include #include @@ -32,25 +33,8 @@ namespace raft { namespace linalg { +namespace detail { -/** - * @brief singular value decomposition (SVD) on the column major float type - * input matrix using QR method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular values of input matrix - * @param right_sing_vecs: right singular values of input matrix - * @param trans_right: transpose right vectors or not - * @param gen_left_vec: generate left eig vector. Not activated.
- * @param gen_right_vec: generate right eig vector. Not activated. - * @param stream cuda stream - */ -// TODO: activate gen_left_vec and gen_right_vec options -// TODO: couldn't template this function due to cusolverDnSgesvd and -// cusolverSnSgesvd. Check if there is any other way. template void svdQR(const raft::handle_t& handle, T* in, @@ -69,15 +53,6 @@ void svdQR(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); -#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 - // 46340: sqrt of max int value - ASSERT(n_rows <= 46340, - "svd solver is not supported for the data that has more than 46340 " - "samples (rows) " - "if you are using CUDA version <11. Please use other solvers such as " - "eig if it is available."); -#endif - const int m = n_rows; const int n = n_cols; @@ -167,7 +142,7 @@ void svdEig(const raft::handle_t& handle, beta, stream); - eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); + raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); raft::matrix::colReverse(V, n_cols, n_cols, stream); raft::matrix::rowReverse(S, n_cols, 1, stream); @@ -192,23 +167,6 @@ void svdEig(const raft::handle_t& handle, } } -/** - * @brief on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular vectors of input matrix - * @param right_sing_vecs: right singular vectors of input matrix - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. 
- * @param stream cuda stream - */ template void svdJacobi(const raft::handle_t& handle, math_t* in, @@ -241,57 +199,44 @@ void svdJacobi(const raft::handle_t& handle, int lwork = 0; int econ = 1; - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - &lwork, - gesvdj_params)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - devInfo.data(), - gesvdj_params, - stream)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param U: left singular vectors of size n_rows x k - * @param S: square matrix with singular values on its diagonal, k x k - * @param V: right singular vectors of size n_cols x k - * @param out: reconstructed matrix to be returned - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values - * @param stream cuda stream - */ template void svdReconstruction(const raft::handle_t& handle, math_t* U, @@ -323,20 +268,6 @@ void svdReconstruction(const raft::handle_t& handle, stream); } -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param A_d: input matrix - * @param U: left singular vectors of size n_rows x k - * @param S_vec: singular values as a vector - * @param V: right singular vectors of size n_cols x k - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values to be computed, 1.0 for normal SVD - * @param tol: tolerance for the evaluation - * @param stream cuda stream - */ template bool evaluateSVDByL2Norm(const raft::handle_t& handle, math_t* A_d, @@ -374,25 +305,26 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, rmm::device_uvector A_minus_P(m * n, stream); RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_N, - m, - n, - &alpha, - A_d, - m, - &beta, - P_d.data(), - m, - A_minus_P.data(), - m, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/detail/transpose.hpp similarity index 56% rename from cpp/include/raft/linalg/transpose.h rename to 
cpp/include/raft/linalg/detail/transpose.hpp index 6b629e73f0..659d3a8ef6 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,15 @@ #pragma once +#include "cublas_wrappers.hpp" + #include -#include #include namespace raft { namespace linalg { +namespace detail { -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param out: output. Transposed input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream: cuda stream - */ template void transpose(const raft::handle_t& handle, math_t* in, @@ -47,28 +40,22 @@ void transpose(const raft::handle_t& handle, const math_t alpha = 1.0; const math_t beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param inout: input and output matrix - * @param n: number of rows and columns of input matrix - * @param stream: cuda stream - */ template void transpose(math_t* inout, int n, cudaStream_t stream) { @@ -90,5 +77,6 @@ void transpose(math_t* inout, int n, cudaStream_t stream) }); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh similarity index 69% rename from cpp/include/raft/linalg/unary_op.cuh rename to cpp/include/raft/linalg/detail/unary_op.cuh index ae8cff2325..9ddfe79657 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
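A minimal usage sketch of the detail-level transpose wrappers that the hunk above just moved (illustrative; in and out are assumed device float pointers holding column-major data):

  // Out-of-place transpose of an n_rows x n_cols matrix via cublasgeam.
  raft::linalg::detail::transpose(handle, in, out, n_rows, n_cols, stream);
  // In-place transpose of a square n x n matrix.
  raft::linalg::detail::transpose(inout, n, stream);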
@@ -22,6 +22,7 @@ namespace raft { namespace linalg { +namespace detail { template __global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) @@ -50,27 +51,12 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** - * @brief perform element-wise unary operation in the input array - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` - */ template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); @@ -99,25 +85,8 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) if (idx < len) { op(out + idx, idx); } } -/** - * @brief Perform an element-wise unary operation into the output array - * - * Compared to `unaryOp()`, this method does not do any reads from any inputs - * - * @tparam OutType output data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * - * @param[out] out the output array [on device] [len = len] - * @param[in] len number of elements in the input array - * @param[in] op the device-lambda which must be of the form: - * `void func(OutType* outLocationOffset, IdxType idx);` - * where outLocationOffset will be out + idx. - * @param[in] stream cuda stream where to launch work - */ template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); @@ -125,5 +94,6 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) RAFT_CUDA_TRY(cudaGetLastError()); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.hpp similarity index 88% rename from cpp/include/raft/linalg/divide.cuh rename to cpp/include/raft/linalg/divide.hpp index 562a3d8991..6c8480bf19 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
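A minimal usage sketch of the write-only variant renamed above (illustrative; out is an assumed device float pointer and len an int; the lambda receives the output location and its index, so no input array is read):

  // Fill out[i] = i.
  raft::linalg::detail::writeOnlyUnaryOpCaller(
    out, len, [] __device__(float* ptr, int idx) { *ptr = static_cast<float>(idx); }, stream);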
@@ -16,11 +16,13 @@ #pragma once -#include "unary_op.cuh" +#include "detail/divide.hpp" namespace raft { namespace linalg { +using detail::divides_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam math_t data-type upon which the math operation will be performed @@ -35,8 +37,7 @@ namespace linalg { template void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); + detail::divideScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp new file mode 100644 index 0000000000..5c465a3a41 --- /dev/null +++ b/cpp/include/raft/linalg/eig.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/eig.hpp" + +namespace raft { +namespace linalg { + +/** + * @defgroup eig Eigen Decomposition Methods + * @{ + */ + +/** + * @brief eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + */ +template +void eigDC(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ + detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); +} + +using detail::COPY_INPUT; +using detail::EigVecMemUsage; +using detail::OVERWRITE_INPUT; + +/** + * @brief eig sel decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. 
+ * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param n_eig_vals: number of eigenvectors to be generated + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param memUsage: the memory selection for eig vector output + * @param stream cuda stream + */ +template +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ + detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); +} + +/** + * @brief overloaded function for eig decomp with Jacobi method for the + * column-major symmetric matrices (in parameter) + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream: stream on which this function will be run + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. + */ +template +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + int sweeps = 15) +{ + detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); +} +/** @} */ // end of eig + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.hpp similarity index 75% rename from cpp/include/raft/linalg/eltwise.cuh rename to cpp/include/raft/linalg/eltwise.hpp index 097c3ac218..5c2a97b57d 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
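A minimal usage sketch of the eigDC wrapper introduced above (illustrative; in, eig_vectors, and eig_vals are assumed device float pointers, with in a column-major symmetric n x n matrix; the underlying cusolver syevd path returns eigenvalues in ascending order):

  raft::linalg::eigDC(handle, in, n, n, eig_vectors, eig_vals, stream);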
@@ -16,12 +16,13 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" +#include "detail/eltwise.hpp" namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam InType data-type upon which the math operation will be performed @@ -36,15 +37,15 @@ namespace linalg { template void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); + detail::scalarAdd(out, in, scalar, len, stream); } +using detail::multiplies_scalar; + template void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); + detail::scalarMultiply(out, in, scalar, len, stream); } /** @} */ @@ -63,50 +64,37 @@ template void eltwiseAdd( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); + detail::eltwiseAdd(out, in1, in2, len, stream); } template void eltwiseSub( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); + detail::eltwiseSub(out, in1, in2, len, stream); } template void eltwiseMultiply( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); + detail::eltwiseMultiply(out, in1, in2, len, stream); } template void eltwiseDivide( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); + detail::eltwiseDivide(out, in1, in2, len, stream); } +using detail::divides_check_zero; + template void eltwiseDivideCheckZero( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, - in1, - in2, - len, - [] __device__(InType a, InType b) { - if (b == InType(0.0)) - return InType(0.0); - else - return a / b; - }, - stream); + detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp new file mode 100644 index 0000000000..04ddbb3561 --- /dev/null +++ b/cpp/include/raft/linalg/gemm.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/gemm.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . 
C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param alpha scalar + * @param beta scalar + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ + detail::gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); +} + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ + detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); +} + +/** + * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * combinations of operand layouts. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * @tparam T Data type of input/output matrices (float/double) + * @param handle raft handle + * @param z output matrix of size M rows x N columns + * @param x input matrix of size M rows x K columns + * @param y input matrix of size K rows x N columns + * @param _M number of rows of X and Z + * @param _N number of rows of Y and columns of Z + * @param _K number of columns of X and rows of Y + * @param isZColMajor Storage layout of Z. true = col major, false = row major + * @param isXColMajor Storage layout of X. true = col major, false = row major + * @param isYColMajor Storage layout of Y. true = col major, false = row major + * @param stream cuda stream + * @param alpha scalar + * @param beta scalar + */ +template +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ + detail::gemm( + handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.hpp similarity index 73% rename from cpp/include/raft/linalg/gemv.h rename to cpp/include/raft/linalg/gemv.hpp index 9eafb3941a..45766b8c9a 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,8 @@ #pragma once -#include -#include -#include +#include "detail/gemv.hpp" +#include #include @@ -61,20 +60,20 @@ void gemv(const raft::handle_t& handle, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublas_device_pointer_mode pmode(cublas_h); - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, - m, - n, - alpha, - A, - lda, - x, - incx, - beta, - y, - incy, - stream)); + detail::cublas_device_pointer_mode pmode(cublas_h); + RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h, + trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, + m, + n, + alpha, + A, + lda, + x, + incx, + beta, + y, + incy, + stream)); } template @@ -99,16 +98,17 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -122,7 +122,7 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); } /** @@ -130,12 +130,15 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -147,31 +150,27 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); } /** * y = alpha * op(A) * x + beta * y * * where - * - * @param alpha is a scalar scale of Ax. - * - * @param beta is a scalar scale of y. - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. 
* set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -186,27 +185,25 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - RAFT_CUBLAS_TRY( - cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } /** * y = op(A) * x * * where - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. * set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run * */ template @@ -220,9 +217,7 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); } }; // namespace linalg diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp new file mode 100644 index 0000000000..bb577672e8 --- /dev/null +++ b/cpp/include/raft/linalg/init.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/init.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [end-start] + * \param [in] start of the range + * \param [in] end of range (exclusive) + * \param [in] stream cuda stream + */ +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + detail::range(out, start, end, stream); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. 
+ * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + detail::range(out, n, stream); +} + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 9376994742..e7d965f810 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,982 +16,14 @@ #pragma once -// for cmath: -#define _USE_MATH_DEFINES - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include +#include "detail/lanczos.hpp" namespace raft { -using namespace matrix; -using namespace linalg; - -namespace spectral { - -// curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) -{ - return curandGenerateNormal(generator, outputPtr, n, mean, stddev); -} -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) -{ - return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); -} - -// ========================================================= -// Helper functions -// ========================================================= - -/** - * @brief Perform Lanczos iteration - * Lanczos iteration is performed on a shifted matrix A+shift*I. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param iter Pointer to current Lanczos iteration. On exit, the - * variable is set equal to the final Lanczos iteration. - * @param maxIter Maximum Lanczos iteration. This function will - * perform a maximum of maxIter-*iter iterations. - * @param shift Matrix shift. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm (i.e. entry in beta_host) is - * less than tol. - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param alpha_host (Output, host memory, maxIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, maxIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Input/output, device memory, - * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as - * columns of a column-major matrix with dimensions - * n x (maxIter+1). - * @param work_dev (Output, device memory, maxIter entries) - * Workspace. Not needed if full reorthogonalization is disabled. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -int performLanczosIteration(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t* iter, - index_type_t maxIter, - value_type_t shift, - value_type_t tol, - bool reorthogonalize, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Useful variables - constexpr value_type_t one = 1; - constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; - value_type_t alpha; - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); - - index_type_t n = A->nrows_; - - // ------------------------------------------------------- - // Compute second Lanczos vector - // ------------------------------------------------------- - if (*iter <= 0) { - *iter = 1; - - // Apply matrix - if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, - lanczosVecs_dev, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); - - // Orthogonalize Lanczos vector - RAFT_CUBLAS_TRY(cublasdot( - cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); - - alpha = -alpha_host[0]; - RAFT_CUBLAS_TRY(cublasaxpy( - cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); - - // Check if Lanczos has converged - if (beta_host[0] <= tol) return 0; - - // Normalize Lanczos vector - alpha = 1 / beta_host[0]; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - } - - // ------------------------------------------------------- - // Compute remaining Lanczos vectors - // ------------------------------------------------------- - - while (*iter < maxIter) { - ++(*iter); - - // Apply matrix - if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); - - // Full reorthogonalization - // "Twice is enough" algorithm per Kahan and Parlett - if (reorthogonalize) { - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(value_type_t), - cudaMemcpyDeviceToHost, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - } - - // Orthogonalization with 3-term recurrence relation - else { - RAFT_CUBLAS_TRY(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 
- 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); - - auto alpha = -alpha_host[*iter - 1]; - RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - alpha = -beta_host[*iter - 2]; - RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - } - - // Compute residual - RAFT_CUBLAS_TRY(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); - - // Check if Lanczos has converged - if (beta_host[*iter - 1] <= tol) break; - - // Normalize Lanczos vector - alpha = 1 / beta_host[*iter - 1]; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - } - - CUDA_TRY(cudaStreamSynchronize(stream)); - - return 0; -} - -/** - * @brief Find Householder transform for 3-dimensional system - * Given an input vector v=[x,y,z]', this function finds a - * Householder transform P such that P*v is a multiple of - * e_1=[1,0,0]'. The input vector v is overwritten with the - * Householder vector such that P=I-2*v*v'. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param v (Input/output, host memory, 3 entries) Input - * 3-dimensional vector. On exit, the vector is set to the - * Householder vector. - * @param Pv (Output, host memory, 1 entry) First entry of P*v - * (here v is the input vector). Either equal to ||v||_2 or - * -||v||_2. - * @param P (Output, host memory, 9 entries) Householder transform - * matrix. Matrix dimensions are 3 x 3. - */ -template -static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) -{ - // Compute norm of vector - *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - - // Choose whether to reflect to e_1 or -e_1 - // This choice avoids catastrophic cancellation - if (v[0] >= 0) *Pv = -(*Pv); - v[0] -= *Pv; - - // Normalize Householder vector - value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - if (normHouseholder != 0) { - v[0] /= normHouseholder; - v[1] /= normHouseholder; - v[2] /= normHouseholder; - } else { - v[0] = 0; - v[1] = 0; - v[2] = 0; - } - - // Construct Householder matrix - index_type_t i, j; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) - P[IDX(i, i, 3)] += 1; -} - -/** - * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix - * The Householder transform is pre-applied to the top three rows - * of the matrix and post-applied to the left three columns. The - * 4 x 4 matrix is intended to contain the bulge that is produced - * in the Francis QR algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param v (Input, host memory, 3 entries) Householder vector. - * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
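 *
 * Illustrative sketch (hypothetical values; not part of the original file):
 * eliminating the first column of a bulge block with this pair of helpers.
 * @code{.cpp}
 *   double v[3]  = {3.0, 4.0, 0.0};  // column to annihilate
 *   double Pv    = 0.0;              // receives +/- ||v||_2
 *   double P[9];                     // 3 x 3 Householder matrix, column-major
 *   double A[16] = {0.0};            // 4 x 4 bulge block, column-major
 *   findHouseholder3<int, double>(v, &Pv, P);  // v becomes the Householder vector
 *   applyHouseholder3<int, double>(v, A);      // pre- and post-applies P to A
 * @endcode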
- */ -template -static void applyHouseholder3(const value_type_t* v, value_type_t* A) -{ - // Loop indices - index_type_t i, j; - // Dot product between Householder vector and matrix row/column - value_type_t vDotA; - - // Pre-apply Householder transform - for (j = 0; j < 4; ++j) { - vDotA = 0; - for (i = 0; i < 3; ++i) - vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) - A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; - } - - // Post-apply Householder transform - for (i = 0; i < 4; ++i) { - vDotA = 0; - for (j = 0; j < 3; ++j) - vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) - A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; - } -} - -/** - * @brief Perform one step of Francis QR algorithm - * Equivalent to two steps of the classical QR algorithm on a - * tridiagonal matrix. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Matrix dimension. - * @param shift1 QR algorithm shift. - * @param shift2 QR algorithm shift. - * @param alpha (Input/output, host memory, n entries) Diagonal - * entries of tridiagonal matrix. - * @param beta (Input/output, host memory, n-1 entries) - * Off-diagonal entries of tridiagonal matrix. - * @param V (Input/output, host memory, n*n entries) Orthonormal - * transforms from previous steps of QR algorithm. Matrix - * dimensions are n x n. On exit, the orthonormal transform from - * this Francis QR step is post-applied to the matrix. - * @param work (Output, host memory, 3*n entries) Workspace. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int francisQRIteration(index_type_t n, - value_type_t shift1, - value_type_t shift2, - value_type_t* alpha, - value_type_t* beta, - value_type_t* V, - value_type_t* work) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Temporary storage of 4x4 bulge and Householder vector - value_type_t bulge[16]; - - // Householder vector - value_type_t householder[3]; - // Householder matrix - value_type_t householderMatrix[3 * 3]; - - // Shifts are roots of the polynomial p(x)=x^2+b*x+c - value_type_t b = -shift1 - shift2; - value_type_t c = shift1 * shift2; - - // Loop indices - index_type_t i, j, pos; - // Temporary variable - value_type_t temp; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Compute initial Householder transform - householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; - householder[1] = beta[0] * (alpha[0] + alpha[1] + b); - householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); - - // Apply initial Householder transform to create bulge - memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) - bulge[IDX(i, i, 4)] = alpha[i]; - for (i = 0; i < 3; ++i) { - bulge[IDX(i + 1, i, 4)] = beta[i]; - bulge[IDX(i, i + 1, 4)] = beta[i]; - } - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); - memcpy(V, work, 3 * n * sizeof(value_type_t)); - - // Chase bulge to bottom-right of matrix with Householder transforms - for (pos = 0; pos < n - 4; ++pos) { - // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - 
for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = beta[pos + 3]; - bulge[IDX(0, 3, 4)] = 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = beta[pos + 3]; - bulge[IDX(3, 3, 4)] = alpha[pos + 4]; - - // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); - } - - // Apply penultimate Householder transform - // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = 0; - bulge[IDX(0, 3, 4)] = 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = 0; - bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); - - // Apply final Householder transform - // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = 0; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); - - // Bulge has been eliminated - alpha[n - 2] = bulge[IDX(0, 0, 4)]; - alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; - - return 0; -} - -/** - * @brief Perform implicit restart of Lanczos algorithm - * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Matrix dimension. - * @param iter Current Lanczos iteration. - * @param iter_new Lanczos iteration after restart. - * @param shiftUpper Pointer (host memory) to upper bound for unwanted - * region. Value is ignored if less than *shiftLower. If a - * stronger upper bound has been found, the value is updated on - * exit. - * @param shiftLower Pointer (host memory) to lower bound for unwanted - * region. Value is ignored if greater than *shiftUpper. If a - * stronger lower bound has been found, the value is updated on - * exit. - * @param alpha_host (Input/output, host memory, iter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Input/output, host memory, iter entries) - * Off-diagonal entries of Lanczos system. - * @param V_host (Output, host memory, iter*iter entries) - * Orthonormal transform used to obtain restarted system. Matrix - * dimensions are iter x iter. - * @param work_host (Output, host memory, 4*iter entries) - * Workspace. 
- * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1)
- *   entries) Lanczos vectors. Vectors are stored as columns of a
- *   column-major matrix with dimensions n x (iter+1).
- * @param work_dev (Output, device memory, (n+iter)*iter entries)
- *   Workspace.
- * @param smallest_eig specifies whether smallest (true) or largest
- *   (false) eigenvalues are to be calculated.
- * @return error flag.
- */
-template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(handle_t const& handle,
-                          index_type_t n,
-                          index_type_t iter,
-                          index_type_t iter_new,
-                          value_type_t* shiftUpper,
-                          value_type_t* shiftLower,
-                          value_type_t* __restrict__ alpha_host,
-                          value_type_t* __restrict__ beta_host,
-                          value_type_t* __restrict__ V_host,
-                          value_type_t* __restrict__ work_host,
-                          value_type_t* __restrict__ lanczosVecs_dev,
-                          value_type_t* __restrict__ work_dev,
-                          bool smallest_eig)
-{
-  // -------------------------------------------------------
-  // Variable declaration
-  // -------------------------------------------------------
-
-  // Useful constants
-  constexpr value_type_t zero = 0;
-  constexpr value_type_t one  = 1;
-
-  auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
-
-  // Loop index
-  index_type_t i;
-
-  // Number of implicit restart steps
-  // Assumed to be even since each call to Francis algorithm is
-  // equivalent to two calls of QR algorithm
-  index_type_t restartSteps = iter - iter_new;
-
-  // Ritz values from Lanczos method
-  value_type_t* ritzVals_host = work_host + 3 * iter;
-  // Shifts for implicit restart
-  value_type_t* shifts_host;
-
-  // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
-
-  // -------------------------------------------------------
-  // Implementation
-  // -------------------------------------------------------
-
-  // Compute Ritz values
-  memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t));
-  memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::sterf(iter, ritzVals_host, work_host);
-
-  // Debug: Print largest eigenvalues
-  // for (int i = iter-iter_new; i < iter; ++i)
-  //  std::cout <<*(ritzVals_host+i)<< " ";
-  // std::cout <<std::endl;
-
-  // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i)
-    V_host[IDX(i, i, iter)] = 1;
-
-  // Determine interval to suppress eigenvalues
-  if (smallest_eig) {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - 1];
-      *shiftLower = ritzVals_host[iter_new];
-    } else {
-      *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]);
-      *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]);
-    }
-  } else {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - iter_new - 1];
-      *shiftLower = ritzVals_host[0];
-    } else {
-      *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]);
-      *shiftLower = std::min(*shiftLower, ritzVals_host[0]);
-    }
-  }
-
-  // Calculate Chebyshev nodes as shifts
-  shifts_host = ritzVals_host;
-  for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
-    shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
-    shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
-  }
-
-  // Apply Francis QR algorithm to implicitly restart Lanczos
-  for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(
-          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
-      WARNING("error in implicitly shifted QR algorithm");
-
-  // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
CUBLAS_OP_N, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1, - stream)); - - // Obtain new Lanczos vectors - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - iter_new, - iter, - &one, - lanczosVecs_dev, - n, - V_dev, - iter, - &zero, - work_dev, - n, - stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, - work_dev, - n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - RAFT_CUBLAS_TRY(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); - - auto h_beta = 1 / beta_host[iter_new - 1]; - RAFT_CUBLAS_TRY( - cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); - - return 0; -} - -} // namespace spectral - // ========================================================= // Eigensolver // ========================================================= -/** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param shift On exit, pointer to matrix shift (estimate for - * largest eigenvalue). - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. 
Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. - */ -template -int computeSmallestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* shift, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - using namespace spectral; - - // Useful constants - constexpr value_type_t one = 1; - constexpr value_type_t zero = 0; - - // Matrix dimension - index_type_t n = A->nrows_; - - // Shift for implicit restart - value_type_t shiftUpper; - value_type_t shiftLower; - - // Lanczos iteration counters - index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system - - // Status flags - int status; - - // Loop index - index_type_t i; - - // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace - - // ------------------------------------------------------- - // Check that parameters are valid - // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // ------------------------------------------------------- - // Variable initialization - // ------------------------------------------------------- - - // Total number of Lanczos iterations - *totalIter = 0; - - // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); - - Z_host = Z_host_v.data(); - work_host = work_host_v.data(); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // Compute largest eigenvalue to determine shift - // ------------------------------------------------------- - - // Random number generator - curandGenerator_t randGen; - // Initialize random number generator - curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - - curandSetPseudoRandomGeneratorSeed(randGen, seed); - - // Initialize initial Lanczos vector - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - value_type_t normQ1; - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); - - auto h_val = 1 / normQ1; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - *shift = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0.0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - - // Determine largest eigenvalue - - Lapack::sterf(*effIter, alpha_host, beta_host); - *shift = -alpha_host[*effIter - 1]; - - // ------------------------------------------------------- 
- // Compute eigenvectors of shifted matrix - // ------------------------------------------------------- - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter; - - // Apply Lanczos method until convergence - shiftLower = 1; - shiftUpper = -1; - while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { - // Determine number of restart steps - // Number of steps must be even due to Francis algorithm - index_type_t iter_new = nEigVecs + 1; - if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) - iter_new = restartIter - (maxIter - *totalIter); - if ((restartIter - iter_new) % 2) iter_new -= 1; - if (iter_new == *effIter) break; - - // Implicit restart of Lanczos method - status = lanczosRestart(handle, - n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - true); - if (status) WARNING("error in Lanczos implicit restart"); - *effIter = iter_new; - - // Check for convergence - if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; - - // Proceed with Lanczos method - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter - iter_new; - } - - // Warning if Lanczos has failed to converge - if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { - WARNING("implicitly restarted Lanczos failed to converge"); - } - - // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, - work_host); - - // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) - work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) - work_host[i + 2 * (*effIter)] = 0; - - // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter), - nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); - - CUDA_TRY(cudaMemcpyAsync(work_dev, - Z_host, - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); - CHECK_CUDA(stream); - - // Convert eigenvectors from Lanczos basis to standard basis - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); - - // Clean up and exit - curandDestroyGenerator(randGen); - return 0; -} - /** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least @@ -1042,344 +74,17 @@ int computeSmallestEigenvectors(handle_t const& handle, value_type_t* __restrict__ eigVecs_dev, unsigned long long seed = 1234567) { - using namespace spectral; - - // Matrix dimension - index_type_t n = A.nrows_; - - // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - 
RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); - - value_type_t* alpha_host = alpha_host_v.data(); - value_type_t* beta_host = beta_host_v.data(); - - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); - - // Perform Lanczos method - index_type_t effIter; - value_type_t shift; - int status = computeSmallestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); - - // Clean up and return - return status; -} - -// ========================================================= -// Eigensolver -// ========================================================= - -/** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. 
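 *
 * Illustrative call sequence (hypothetical buffers, assuming float data and
 * int indexing; not part of the original file):
 * @code{.cpp}
 *   int effIter = 0, totalIter = 0;
 *   std::vector<float> alpha(restartIter), beta(restartIter);
 *   // lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev: device buffers
 *   // sized as documented above
 *   computeLargestEigenvectors(handle, &A, nEigVecs, maxIter, restartIter,
 *                              tol, true, &effIter, &totalIter,
 *                              alpha.data(), beta.data(),
 *                              lanczosVecs_dev, work_dev,
 *                              eigVals_dev, eigVecs_dev, 1234567ULL);
 * @endcode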
- */ -template -int computeLargestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - using namespace spectral; - - // Useful constants - constexpr value_type_t one = 1; - constexpr value_type_t zero = 0; - - // Matrix dimension - index_type_t n = A->nrows_; - - // Lanczos iteration counters - index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system - - // Status flags - int status; - - // Loop index - index_type_t i; - - // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace - - // ------------------------------------------------------- - // Check that LAPACK is enabled - // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); - - // ------------------------------------------------------- - // Check that parameters are valid - // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // ------------------------------------------------------- - // Variable initialization - // ------------------------------------------------------- - - // Total number of Lanczos iterations - *totalIter = 0; - - // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); - - Z_host = Z_host_v.data(); - work_host = work_host_v.data(); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // Compute largest eigenvalue - // ------------------------------------------------------- - - // Random number generator - curandGenerator_t randGen; - // Initialize random number generator - curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - curandSetPseudoRandomGeneratorSeed(randGen, seed); - // Initialize initial Lanczos vector - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - value_type_t normQ1; - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); - - auto h_val = 1 / normQ1; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - value_type_t shift_val = 0.0; - value_type_t* shift = &shift_val; - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter; - - // Apply Lanczos method until convergence - value_type_t shiftLower = 1; - value_type_t shiftUpper = -1; - while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { - 
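// (Each pass below shrinks the Lanczos system back to iter_new vectors via an
// implicit restart, then extends it again; shiftLower/shiftUpper bracket the
// unwanted lower end of the spectrum used in the convergence test above.)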
// Determine number of restart steps
-    // Number of steps must be even due to Francis algorithm
-    index_type_t iter_new = nEigVecs + 1;
-    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
-      iter_new = restartIter - (maxIter - *totalIter);
-    if ((restartIter - iter_new) % 2) iter_new -= 1;
-    if (iter_new == *effIter) break;
-
-    // Implicit restart of Lanczos method
-    status = lanczosRestart(handle,
-                            n,
-                            *effIter,
-                            iter_new,
-                            &shiftUpper,
-                            &shiftLower,
-                            alpha_host,
-                            beta_host,
-                            Z_host,
-                            work_host,
-                            lanczosVecs_dev,
-                            work_dev,
-                            false);
-    if (status) WARNING("error in Lanczos implicit restart");
-    *effIter = iter_new;
-
-    // Check for convergence
-    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
-
-    // Proceed with Lanczos method
-
-    status = performLanczosIteration(handle,
-                                     A,
-                                     effIter,
-                                     maxIter_curr,
-                                     *shift,
-                                     tol * fabs(shiftLower),
-                                     reorthogonalize,
-                                     alpha_host,
-                                     beta_host,
-                                     lanczosVecs_dev,
-                                     work_dev);
-    if (status) WARNING("error in Lanczos iteration");
-    *totalIter += *effIter - iter_new;
-  }
-
-  // Warning if Lanczos has failed to converge
-  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
-    WARNING("implicitly restarted Lanczos failed to converge");
-  }
-  for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j)
-      Z_host[i * restartIter + j] = 0;
-  }
-  // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I',
-                              *effIter,
-                              work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter),
-                              Z_host,
-                              *effIter,
-                              work_host);
-
-  // note: we need the top nEigVecs eigenpairs, but effIter can be larger than
-  // nEigVecs, so we add an offset to index the top nEigVecs eigenpairs in the
-  // matrix of size effIter. The eigenvalues are sorted in ascending order, so
-  // no offset is needed in the smallest-eigenvalues case: the first entries
-  // are already the smallest ones.
-
-  index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs;
-
-  // Debug : print nEigVecs largest eigenvalues
-  // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i)
-  //  std::cout <<*(work_host+(2*(*effIter)+i))<< " ";
-  // std::cout <<std::endl;
-
-  // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i)
-    work_host[i + 2 * (*effIter)] -= *shift;
-
-  for (i = 0; i < top_eigenparis_idx_offset; ++i)
-    work_host[i + 2 * (*effIter)] = 0;
-
-  // Copy results to device memory, skipping the smallest eigenpairs if needed
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-                           nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
-
-  CUDA_TRY(cudaMemcpyAsync(work_dev,
-                           Z_host + top_eigenparis_idx_offset * (*effIter),
-                           (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
-  CHECK_CUDA(stream);
-
-  // Convert eigenvectors from Lanczos basis to standard basis
-  RAFT_CUBLAS_TRY(cublasgemm(cublas_h,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             n,
-                             nEigVecs,
-                             *effIter,
-                             &one,
-                             lanczosVecs_dev,
-                             n,
-                             work_dev,
-                             *effIter,
-                             &zero,
-                             eigVecs_dev,
-                             n,
-                             stream));
-
-  // Clean up and exit
-  curandDestroyGenerator(randGen);
-  return 0;
-}
-
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(handle_t const& handle,
                                sparse_matrix_t<index_type_t, value_type_t> const& A,
                                index_type_t nEigVecs,
                                index_type_t maxIter,
                                index_type_t restartIter,
                                value_type_t tol,
                                bool reorthogonalize,
                                index_type_t& iter,
                                value_type_t* __restrict__ eigVals_dev,
                                value_type_t* __restrict__ eigVecs_dev,
                                unsigned long long seed = 8675309)
 {
-  using namespace spectral;
-
-  // Matrix dimension
-  index_type_t n = A.nrows_;
-
-  // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
-  RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
-  RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
-  RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
-  RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
-
-  // Allocate memory
-  std::vector<value_type_t> alpha_host_v(restartIter);
-  std::vector<value_type_t> beta_host_v(restartIter);
-
-  value_type_t* alpha_host = alpha_host_v.data();
-  value_type_t* beta_host  = beta_host_v.data();
-
-  vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
-  vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
-
-  // Perform Lanczos method
-  index_type_t effIter;
-  int status = computeLargestEigenvectors(handle,
-                                          &A,
-                                          nEigVecs,
-                                          maxIter,
-                                          restartIter,
-                                          tol,
-                                          reorthogonalize,
-                                          &effIter,
-                                          &iter,
-                                          alpha_host,
-                                          beta_host,
-                                          lanczosVecs_dev.raw(),
-                                          work_dev.raw(),
-                                          eigVals_dev,
-                                          eigVecs_dev,
-                                          seed);
-
-  // Clean up and return
-  return status;
+  return detail::computeLargestEigenvectors(handle,
+                                            A,
+                                            nEigVecs,
+                                            maxIter,
+                                            restartIter,
+                                            tol,
+                                            reorthogonalize,
+                                            iter,
+                                            eigVals_dev,
+                                            eigVecs_dev,
+                                            seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.hpp
similarity index 61%
rename from cpp/include/raft/linalg/map.cuh
rename to cpp/include/raft/linalg/map.hpp
index 4facc5e72c..febeaa8621 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,32 +16,11 @@
 
 #pragma once
 
-#include
-#include
-#include
-#include
+#include "detail/map.cuh"
 
 namespace raft {
 namespace linalg {
 
-template
-__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
-{
-  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
-
-  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
-}
-
-template
-void mapImpl(
-  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
-  const int nblks = raft::ceildiv(len, (size_t)TPB);
-  mapKernel<<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
 /**
 * @brief CUDA version of map
 * @tparam InType data-type upon which the math operation will be performed
 * @tparam MapOp the device-lambda performing the actual operation
 * @tparam TPB threads-per-block in the final kernel launched
 * @tparam Args additional parameters
 * @param out the output array
 * @param len number of elements in the input array
 * @param map the device-lambda
 * @param stream cuda-stream where to launch this kernel
 * @param in the input array
 * @param args additional input arrays
 */
@@ -64,7 +43,7 @@
 template
 void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
 {
-  mapImpl(out, len, map, stream, in, args...);
+  detail::mapImpl(out, len, map, stream, in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
new file mode 100644
index 0000000000..04275995a0
--- /dev/null
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/map_then_reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version of map and then sum reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output sum-reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapThenReduceImpl( + out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); +} + +/** + * @brief CUDA version of map and then generic reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual map operation + * @tparam ReduceLambda the device-lambda performing the actual reduction + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param neutral The neutral element of the reduction operation. For example: + * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max + * @param map the device-lambda + * @param op the reduction device lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ + detail::mapThenReduceImpl( + out, len, neutral, map, op, stream, in, args...); +} +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.hpp similarity index 89% rename from cpp/include/raft/linalg/matrix_vector_op.cuh rename to cpp/include/raft/linalg/matrix_vector_op.hpp index 750eca0742..b9790ebce2 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include "detail/matrix_vector_op.cuh" namespace raft { namespace linalg { @@ -55,10 +55,7 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - IdxType nLines = rowMajor ? 
N : D; - return matrix::linewiseOp( - out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); + detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } /** @@ -97,10 +94,7 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - IdxType nLines = rowMajor ? N : D; - return matrix::linewiseOp( - out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); + detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.hpp similarity index 82% rename from cpp/include/raft/linalg/mean_squared_error.cuh rename to cpp/include/raft/linalg/mean_squared_error.hpp index a3fcc5bac6..42af8642b6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "map_then_reduce.cuh" +#include "detail/mean_squared_error.hpp" namespace raft { namespace linalg { @@ -36,11 +36,7 @@ template void meanSquaredError( math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) { - auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { - math_t diff = a - b; - return diff * diff * weight / len; - }; - mapThenSumReduce(out, len, sq_diff, stream, A, B); + detail::meanSquaredError(out, A, B, len, weight, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.hpp similarity index 88% rename from cpp/include/raft/linalg/multiply.cuh rename to cpp/include/raft/linalg/multiply.hpp index 53d57ecd00..4a1628b44a 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "unary_op.cuh" +#include "detail/multiply.hpp" namespace raft { namespace linalg { @@ -35,8 +35,7 @@ namespace linalg { template void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); + detail::multiplyScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.hpp similarity index 65% rename from cpp/include/raft/linalg/norm.cuh rename to cpp/include/raft/linalg/norm.hpp index 82558c8023..a6336769ca 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
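Each of the thinned wrappers above keeps its original public signature and simply forwards to the detail:: implementation, so existing call sites compile unchanged. A minimal caller sketch (illustrative only; assumes device pointers d_a, d_b, d_out of length n, a single-element device pointer d_mse, and a valid cudaStream_t stream):

    // weighted mean squared error of two device arrays, reduced into d_mse
    raft::linalg::meanSquaredError(d_mse, d_a, d_b, n, 1.0f, stream);
    // elementwise multiplication by a host-side scalar
    raft::linalg::multiplyScalar(d_out, d_a, 2.0f, n, stream);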
@@ -16,13 +16,15 @@ #pragma once -#include "reduce.cuh" +#include "detail/norm.hpp" namespace raft { namespace linalg { /** different types of norms supported on the input buffers */ -enum NormType { L1Norm = 0, L2Norm }; +using detail::L1Norm; +using detail::L2Norm; +using detail::NormType; /** * @brief Compute row-wise norm of the input matrix and perform fin_op lambda @@ -54,37 +56,7 @@ void rowNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); - }; + detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } /** @@ -111,37 +83,7 @@ void colNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); - }; + detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp new file mode 100644 index 0000000000..50e97e4069 --- /dev/null +++ b/cpp/include/raft/linalg/qr.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/qr.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup QRdecomp QR decomposition + * @{ + */ + +/** + * @brief compute QR decomp and return only Q matrix + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + * @{ + */ +template +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream); +} + +/** + * @brief compute QR decomp and return both Q and R matrices + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param R: R matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + */ +template +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.hpp similarity index 81% rename from cpp/include/raft/linalg/reduce.cuh rename to cpp/include/raft/linalg/reduce.hpp index 1f14f6eb31..1c4ef70df8 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ #pragma once -#include "coalesced_reduction.cuh" -#include "strided_reduction.cuh" -#include +#include "detail/reduce.hpp" namespace raft { namespace linalg { @@ -71,15 +69,8 @@ void reduce(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } + detail::reduce( + dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp new file mode 100644 index 0000000000..0f97323e5a --- /dev/null +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/strided_reduction.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Compute reduction of the input matrix along the strided dimension
+ *
+ * @tparam InType the data type of the input
+ * @tparam OutType the data type of the output (as well as the data type for
+ * which reduction is performed)
+ * @tparam IdxType data type of the indices of the array
+ * @tparam MainLambda Unary lambda applied during accumulation (eg: L1 or L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*ReduceLambda)(OutType, OutType);</pre>
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*FinalLambda)(OutType);</pre>
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.hpp similarity index 68% rename from cpp/include/raft/linalg/subtract.cuh rename to cpp/include/raft/linalg/subtract.hpp index b33378bf33..9d48948cad 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" -#include +#include "detail/subtract.cuh" namespace raft { namespace linalg { @@ -40,8 +38,7 @@ namespace linalg { template void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; - unaryOp(out, in, len, op, stream); + detail::subtractScalar(out, in, scalar, len, stream); } /** @@ -60,19 +57,7 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea template void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; - binaryOp(out, in1, in2, len, op, stream); -} - -template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ - // TODO: kernel do not use shared memory in current implementation - int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } + detail::subtract(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and @@ -93,12 +78,7 @@ void subtractDevScalar(math_t* outDev, IdxType len, cudaStream_t stream) { - // Just for the note - there is no way to express such operation with cuBLAS in effective way - // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda - const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); - subtract_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp new file mode 100644 index 
0000000000..a30180b174
--- /dev/null
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/svd.hpp"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief singular value decomposition (SVD) on the column major
+ * input matrix using the QR method
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param sing_vals: singular values of input matrix
+ * @param left_sing_vecs: left singular vectors of input matrix
+ * @param right_sing_vecs: right singular vectors of input matrix
+ * @param trans_right: transpose right vectors or not
+ * @param gen_left_vec: generate left singular vectors. Not activated.
+ * @param gen_right_vec: generate right singular vectors. Not activated.
+ * @param stream cuda stream
+ */
+// TODO: activate gen_left_vec and gen_right_vec options
+// TODO: couldn't template this function due to cusolverDnSgesvd and
+// cusolverDnDgesvd. Check if there is any other way.
+template <typename T>
+void svdQR(const raft::handle_t& handle,
+           T* in,
+           int n_rows,
+           int n_cols,
+           T* sing_vals,
+           T* left_sing_vecs,
+           T* right_sing_vecs,
+           bool trans_right,
+           bool gen_left_vec,
+           bool gen_right_vec,
+           cudaStream_t stream)
+{
+  detail::svdQR(handle,
+                in,
+                n_rows,
+                n_cols,
+                sing_vals,
+                left_sing_vecs,
+                right_sing_vecs,
+                trans_right,
+                gen_left_vec,
+                gen_right_vec,
+                stream);
+}
+
+/**
+ * @brief singular value decomposition (SVD) on the column major input
+ * matrix via eigendecomposition of its Gram matrix
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param S: singular values of input matrix
+ * @param U: left singular vectors of input matrix
+ * @param V: right singular vectors of input matrix
+ * @param gen_left_vec: generate left singular vectors or not
+ * @param stream cuda stream
+ */
+template <typename T>
+void svdEig(const raft::handle_t& handle,
+            T* in,
+            int n_rows,
+            int n_cols,
+            T* S,
+            T* U,
+            T* V,
+            bool gen_left_vec,
+            cudaStream_t stream)
+{
+  detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream);
+}
+
+/**
+ * @brief singular value decomposition (SVD) on the column major input
+ * matrix using Jacobi method
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param sing_vals: singular values of input matrix
+ * @param left_sing_vecs: left singular vectors of input matrix
+ * @param right_sing_vecs: right singular vectors of input matrix
+ * @param gen_left_vec: generate left singular vectors. Not activated.
+ * @param gen_right_vec: generate right singular vectors. Not activated.
+ * @param tol: error tolerance for the Jacobi method. Algorithm stops when the
+ * error is below tol
+ * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more
+ * sweeps, the better the accuracy.
diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp
new file mode 100644
index 0000000000..50608877fa
--- /dev/null
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/transpose.hpp"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief transpose on the column major input matrix
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param out: output matrix, the transposed input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void transpose(const raft::handle_t& handle,
+               math_t* in,
+               math_t* out,
+               int n_rows,
+               int n_cols,
+               cudaStream_t stream)
+{
+  detail::transpose(handle, in, out, n_rows, n_cols, stream);
+}
+
+/**
+ * @brief in-place transpose of a square, column major input matrix
+ * @param inout: input and output matrix
+ * @param n: number of rows and columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void transpose(math_t* inout, int n, cudaStream_t stream)
+{
+  detail::transpose(inout, n, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
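transpose.hpp follows the same wrapper pattern; a short sketch of both overloads (dimensions and rmm buffer setup are illustrative assumptions):

#include <raft/handle.hpp>
#include <raft/linalg/transpose.hpp>
#include <rmm/device_uvector.hpp>

void transpose_example(const raft::handle_t& handle, float* in, int n_rows, int n_cols)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<float> out(size_t(n_rows) * n_cols, stream);
  // column major n_rows x n_cols -> column major n_cols x n_rows
  raft::linalg::transpose(handle, in, out.data(), n_rows, n_cols, stream);
  // a square matrix could instead be transposed in place with the second overload:
  // raft::linalg::transpose(inout, n, stream);
}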
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
new file mode 100644
index 0000000000..51faa2e4a4
--- /dev/null
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/unary_op.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief perform an element-wise unary operation on the input array
+ * @tparam InType input data-type
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam OutType output data-type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @param out the output array
+ * @param in the input array
+ * @param len number of elements in the input array
+ * @param op the device-lambda
+ * @param stream cuda stream where to launch work
+ * @note Lambda must be a functor with the following signature:
+ *       `OutType func(const InType& val);`
+ */
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  detail::unaryOpCaller(out, in, len, op, stream);
+}
+
+/**
+ * @brief Perform an element-wise unary operation into the output array
+ *
+ * Compared to `unaryOp()`, this method does not do any reads from any inputs
+ *
+ * @tparam OutType output data-type
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ *
+ * @param[out] out    the output array [on device] [len = len]
+ * @param[in]  len    number of elements in the input array
+ * @param[in]  op     the device-lambda which must be of the form:
+ *                    `void func(OutType* outLocationOffset, IdxType idx);`
+ *                    where outLocationOffset will be out + idx.
+ * @param[in]  stream cuda stream where to launch work
+ */
+template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
+void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+{
+  detail::writeOnlyUnaryOpCaller(out, len, op, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
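The lambda shapes documented above are the key usage detail; a sketch of both entry points (device lambdas require nvcc's --extended-lambda; the rmm buffer setup is an assumption):

#include <raft/handle.hpp>
#include <raft/linalg/unary_op.hpp>
#include <rmm/device_uvector.hpp>

void unary_op_example(const raft::handle_t& handle)
{
  cudaStream_t stream = handle.get_stream();
  const int len = 1024;
  rmm::device_uvector<float> in(len, stream), out(len, stream);
  // unaryOp reads in[i] and writes out[i]: `OutType func(const InType& val)`
  raft::linalg::unaryOp(
    out.data(), in.data(), len, [] __device__(float x) { return x * x; }, stream);
  // writeOnlyUnaryOp never reads; op receives (out + idx, idx)
  raft::linalg::writeOnlyUnaryOp(
    out.data(), len, [] __device__(float* ptr, int idx) { *ptr = float(idx); }, stream);
}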
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
index 95103ab98e..6b32cbc06e 100644
--- a/cpp/include/raft/matrix/detail/math.cuh
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -20,10 +20,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
 #include
 #include
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index f9cfffe64d..6d631b4f4f 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-#include
+#include
 
 namespace raft {
 namespace matrix {
@@ -278,7 +278,8 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st
 {
   cublasHandle_t cublasH = handle.get_cublas_handle();
   m_t normval            = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  // #TODO: Call from the public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
index 07bf251f14..f65f524f62 100644
--- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
@@ -20,7 +20,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index 00054a8e96..b1e9ae5a55 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index 7f63a7fec8..57411f6998 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -20,8 +20,8 @@
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index 1e907c98eb..4ceb31a3c8 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -20,7 +20,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp
index 2f121dce33..7ec032d186 100644
--- a/cpp/include/raft/sparse/distance/distance.hpp
+++ b/cpp/include/raft/sparse/distance/distance.hpp
@@ -20,7 +20,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 5d4640f4a6..bd96ca8649 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -20,10 +20,10 @@
 #include
 #include
-#include
+#include
 #include
-#include
+#include
#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/slice.h b/cpp/include/raft/sparse/op/detail/slice.h index 3c47d19a0b..0f4f50ceb6 100644 --- a/cpp/include/raft/sparse/op/detail/slice.h +++ b/cpp/include/raft/sparse/op/detail/slice.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 817b9782f2..b56b2df02e 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index de0a15c029..3de10a2782 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -20,8 +20,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh index 1191251039..6ac96e1324 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh @@ -24,7 +24,7 @@ #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index ddfb74dedc..8b2747d104 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index 6d7cb826da..825761d44d 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 79f75dc8ae..339ca3687a 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index a4bdb8f2de..5b93439218 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -21,7 +21,7 @@ #include "ball_cover_common.h" #include "detail/ball_cover.cuh" #include "detail/ball_cover/common.cuh" -#include +#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index 2830b81cc0..e1a202107b 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -17,8 +17,8 @@ #pragma once #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 1c2f21b72c..153b6b1d8a 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,7 @@ #include -#include +#include #include diff --git 
a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 328ec0bf81..587505316b 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -20,7 +20,7 @@ #include #include -#include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 385e16383e..6b5df01a97 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -17,7 +17,7 @@ #include #include #include -#include +#include // TODO: Need to hide the PairwiseDistance class impl and expose to public API #include "processing.hpp" #include diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 8d40860535..06473f6151 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -25,8 +25,8 @@ #include #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 98a8885369..d5dfe4f8f8 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -30,8 +30,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index f87fffc6cf..a515ca8507 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,10 +15,10 @@ */ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index bcb28bfb1e..56f4022a8c 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -657,20 +657,21 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); @@ -852,7 +853,9 @@ int kmeans(handle_t const& handle, } // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 891c367d5e..d066c68a68 100644 --- 
a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -18,8 +18,8 @@ #include #include -#include -#include +#include +#include // for now; TODO: check if/where this `define` should be; // diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index a260e75505..75f0121795 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include @@ -349,7 +349,8 @@ struct laplacian_matrix_t : sparse_matrix_t { if (beta == 0) { CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { - RAFT_CUBLAS_TRY(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); } // Apply diagonal matrix @@ -412,7 +413,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - RAFT_CUBLAS_TRY(linalg::cublasdot(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, laplacian_matrix_t::diagonal_.raw(), 1, @@ -424,7 +427,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - RAFT_CUBLAS_TRY(linalg::cublasaxpy(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasaxpy(cublas_h, n, &gamma_, laplacian_matrix_t::diagonal_.raw(), diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index c61b5f1458..8188a772b8 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -160,7 +160,9 @@ void analyzeModularity(handle_t const& handle, vector_t Bx(handle, n); // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Use public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, csr_m}; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index a30906de10..6b57566a73 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -132,7 +133,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::minus()); RAFT_CHECK_CUDA(stream); - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -149,22 +152,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + 
raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); RAFT_CUDA_TRY(cudaMemcpyAsync( eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); @@ -216,14 +222,18 @@ bool construct_indicator(handle_t const& handle, RAFT_CHECK_CUDA(stream); // Compute size of ith partition - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 899e378d38..a512579c11 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh index 1a4fc20c51..db2eaf8459 100644 --- a/cpp/include/raft/stats/detail/mean_center.cuh +++ b/cpp/include/raft/stats/detail/mean_center.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh index ed411ef74d..e3f586fea8 100644 --- a/cpp/include/raft/stats/detail/meanvar.cuh +++ b/cpp/include/raft/stats/detail/meanvar.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft::stats::detail { diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index 229eb34a7d..c07c212e54 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 1db504965c..ad46c3bf10 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 27908d8f15..176922529f 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index b54e69df36..c277db76ee 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -18,7 +18,7 @@ #include "add.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 70e4866407..1f1ff87a4d 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include 
namespace raft { namespace linalg { diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 2cf0679849..55810a5ca0 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -18,7 +18,7 @@ #include "binary_op.cuh" #include #include -#include +#include #include #include diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index c2ba8c18ee..b9ca9f8fd2 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -18,7 +18,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 9f07341c33..9f44cc8d5f 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -17,8 +17,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -42,7 +42,8 @@ class CholeskyR1Test : public ::testing::Test { // Allocate workspace solver_handle = handle.get_cusolver_dn_handle(); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize( + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize( solver_handle, CUBLAS_FILL_MODE_LOWER, n_rows, L.data(), n_rows, &Lwork)); int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace @@ -72,15 +73,16 @@ class CholeskyR1Test : public ::testing::Test { // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, handle.get_stream()); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(solver_handle, - uplo, - rank, - L_exp.data(), - n_rows, - (math_t*)workspace.data(), - Lwork, - devInfo.data(), - handle.get_stream())); + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + handle.get_stream())); // Incremental Cholesky factorization using rank one updates. 
raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index f7bd0b60a4..1d1a4fd864 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 51173c07f0..f07a7d05ce 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 83bca6198d..9100a3a5f6 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b46efc38fd..4ae2653e47 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include namespace raft { @@ -83,15 +83,15 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - eigSelDC(handle, - cov_matrix.data(), - params.n_row, - params.n_col, - 3, - eig_vectors.data(), - eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, - stream); + raft::linalg::eigSelDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + 3, + eig_vectors.data(), + eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, + stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 56e091837a..146d48e179 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index d539fe9a69..72567ff5f9 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index f32d4cf809..ea84e06675 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 74b812a63b..d27fad4dfc 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -17,8 +17,8 @@ #include "../test_utils.h" #include #include -#include -#include +#include +#include #include namespace raft { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 5b60b69d36..9875e2548f 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 531bf370d4..9ab005a075 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -16,7 +16,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu 
index 7bf797a4dc..ec0599eb1b 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 6415d4dca9..56e111d056 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index fa05397555..14f34f142d 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 5fb7ddbc7b..7840df2c0d 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -17,11 +17,9 @@ #pragma once #include - #include -#include -#include - +#include +#include #include #include @@ -69,7 +67,8 @@ void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t raft::linalg::unaryOp( ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgemv( + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv( handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); RAFT_CUDA_TRY(cudaDeviceSynchronize()); RAFT_CUBLAS_TRY(cublasDestroy(handle)); diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index f2aa02d9d9..6d33fbdef1 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -18,7 +18,7 @@ #include "reduce.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 9c62eeb9f1..7b82dab1ad 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index 47895cbc6a..e9128bad93 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 6aa83fc074..60db1ee82b 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 1b132955f5..050fed78ea 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index 8bb2d1e0be..b47e60d4f6 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -18,7 +18,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 1cd00b8adc..cd0d065ad4 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index c9b15737a1..648964fc57 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -26,8 +26,8 @@ #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index dc136d6f18..d3f4adb01b 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -19,8 +19,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index f4f346561c..c8798f832f 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index bcfa796931..f0336a31fa 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -18,7 +18,7 @@ #include #include "../test_utils.h" -#include +#include #include #include diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 81e6dc4768..cb09b9e7f5 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -17,8 +17,8 @@ #include "../test_utils.h" #include -#include -#include +#include +#include #include #include diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index 257950e4d7..66cd11be1f 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include "spatial_data.h" #include -#include +#include #include #include #if defined RAFT_NN_COMPILED diff --git a/cpp/test/spatial/faiss_mr.cu b/cpp/test/spatial/faiss_mr.cu index a626824621..e635619897 100644 --- a/cpp/test/spatial/faiss_mr.cu +++ b/cpp/test/spatial/faiss_mr.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include -#include +#include #include #include diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index d30c018ed2..3254d41401 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index f60ec54bbc..6b7402a7bd 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 8af1505bcd..ee216ee434 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,7 +16,7 @@ #include "../test_utils.h" -#include +#include #include #if defined RAFT_NN_COMPILED diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index 55e72656da..fd656423ad 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include #include
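The remaining hunks are all of the same shape: tests pick up consolidated raft/<area>/<name>.hpp public headers in place of the old .cuh paths (the exact old targets are elided above). A sketch of what one migrated call site might look like, assuming raft/stats/sum.hpp exposes the usual sum(output, input, D, N, rowMajor, stream) wrapper; the include paths and buffer setup here are assumptions, not taken from the diff:

#include <raft/handle.hpp>
#include <raft/stats/sum.hpp>  // previously a .cuh include (assumed)
#include <rmm/device_uvector.hpp>

void sum_example(const raft::handle_t& handle)
{
  cudaStream_t stream = handle.get_stream();
  const int D = 4, N = 128;  // D columns, N rows (illustrative)
  rmm::device_uvector<float> data(D * N, stream);
  rmm::device_uvector<float> col_sums(D, stream);
  // column-wise sums of a column major N x D matrix (rowMajor = false);
  // the public wrapper forwards to raft::stats::detail, as elsewhere in this PR
  raft::stats::sum(col_sums.data(), data.data(), D, N, false, stream);
}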