LinAlg impl in detail #383

Merged Feb 8, 2022 · 27 commits

Changes from 1 commit

Commits
478ddac
working through
divyegala Oct 21, 2021
d4b72ba
working ththrough
divyegala Nov 5, 2021
b472870
linalg detail
divyegala Nov 17, 2021
3bd9645
merging branch 22.02
divyegala Nov 17, 2021
788ffa8
style fix
divyegala Nov 17, 2021
f7d43b5
correcting include
divyegala Nov 17, 2021
282cd48
merging branch-21.12
divyegala Nov 17, 2021
37596c9
Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.12-l…
divyegala Nov 17, 2021
cd4e1f9
merging upstream
divyegala Dec 14, 2021
9c0d655
removing deleted file again
divyegala Dec 14, 2021
a071d09
correcting merges and passing tests
divyegala Dec 14, 2021
db817f6
changing h extensions to hpp
divyegala Dec 14, 2021
abec4d2
cublas/cusolver only in detail, wrap up rest of linalg
divyegala Dec 22, 2021
b424cf1
merging upstream
divyegala Dec 22, 2021
34b2439
correcting doxygen build
divyegala Dec 22, 2021
897e6f7
correcting wrong docs
divyegala Dec 22, 2021
3d4b5f1
review feedback
divyegala Jan 11, 2022
4163619
merging branch-22.02
divyegala Jan 25, 2022
8ff01a9
Merge remote-tracking branch 'upstream/branch-22.04' into imp-21.12-l…
divyegala Jan 25, 2022
b6471d6
review changes
divyegala Jan 26, 2022
5d8c176
more macro renames
divyegala Jan 27, 2022
14cddfc
adding explict stream set back to cublas and cusolver wrappers
divyegala Feb 2, 2022
a2f670f
resolving errors
divyegala Feb 2, 2022
89bf3c1
adding set stream to cublas set pointer mode
divyegala Feb 4, 2022
3c5d303
Merge branch 'branch-22.04' into imp-linalg-public
cjnolet Feb 4, 2022
5759c80
Merge branch 'branch-22.04' into imp-21.12-linalg_detail
cjnolet Feb 7, 2022
f94beef
Fixing a bad merge
cjnolet Feb 7, 2022
Commit shown below: linalg detail
divyegala committed Nov 17, 2021

commit b472870b53770411edab225f382dc0a658c8f734
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/correlation.cuh
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
#pragma once
#include <raft/cuda_utils.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/reduce.cuh>
#include <raft/linalg/reduce.hpp>

namespace raft {
namespace distance {
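This is the first of several identical header swaps in this commit: the distance and label headers stop including the linalg .cuh files directly and include the .hpp wrappers instead (the same one-line change repeats in the cosine, euclidean, fused_l2_nn, hellinger, pairwise_distance_base, and classlabels diffs below). A minimal sketch of the pattern in a downstream header, using paths from these diffs; the surrounding file is illustrative:

// before this commit: linalg primitives were included as .cuh headers
// #include <raft/linalg/reduce.cuh>
// #include <raft/linalg/norm.cuh>

// after this commit: the public .hpp wrappers are included instead,
// while the .cuh implementations move under raft/linalg/detail/
#include <raft/linalg/reduce.hpp>
#include <raft/linalg/norm.hpp>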
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/cosine.cuh
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
#pragma once

#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/norm.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/euclidean.cuh
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@

#pragma once
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/norm.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/fused_l2_nn.cuh
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@
#include <limits>
#include <raft/cuda_utils.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/contractions.cuh>
#include <raft/linalg/contractions.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/hellinger.cuh
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@

#pragma once
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/linalg/unary_op.hpp>

namespace raft {
namespace distance {
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/detail/pairwise_distance_base.cuh
Original file line number Diff line number Diff line change
@@ -16,8 +16,8 @@
#pragma once
#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>
#include <raft/linalg/contractions.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/contractions.hpp>
#include <raft/linalg/norm.hpp>
#include <raft/vectorized.cuh>

#include <cstddef>
2 changes: 1 addition & 1 deletion cpp/include/raft/label/classlabels.cuh
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@

#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/linalg/unary_op.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

Original file line number Diff line number Diff line change
@@ -19,8 +19,8 @@
#include "detail/add.cuh"
#include "detail/functional.cuh"

#include "binary_op.cuh"
#include "unary_op.cuh"
#include "binary_op.hpp"
#include "unary_op.hpp"

namespace raft {
namespace linalg {
Original file line number Diff line number Diff line change
@@ -43,35 +43,7 @@ template <typename InType, typename Lambda, typename OutType = InType,
typename IdxType = int, int TPB = 256>
void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
Lambda op, cudaStream_t stream) {
constexpr auto maxSize =
sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
size_t bytes = len * maxSize;
uint64_t in1Addr = uint64_t(in1);
uint64_t in2Addr = uint64_t(in2);
uint64_t outAddr = uint64_t(out);
if (16 / maxSize && bytes % 16 == 0 &&
detail::addressAligned(in1Addr, in2Addr, outAddr, 16)) {
detail::binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (8 / maxSize && bytes % 8 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 8)) {
detail::binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (4 / maxSize && bytes % 4 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 4)) {
detail:: binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (2 / maxSize && bytes % 2 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 2)) {
detail::binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (1 / maxSize) {
detail::binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else {
detail::binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
op, stream);
}
detail::binaryOp(out, in1, in2, len, op, stream);
}

}; // end namespace linalg
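The public binaryOp above is now a thin forwarder into detail::binaryOp; its signature is unchanged. A minimal usage sketch against the public wrapper (the device lambda and helper function are illustrative, not part of this diff; the extended lambda needs nvcc with --extended-lambda):

#include <raft/linalg/binary_op.hpp>

// Element-wise out[i] = a[i] + b[i] on a stream.
void add_vectors(float* out, const float* a, const float* b, int len, cudaStream_t stream)
{
  raft::linalg::binaryOp(
    out, a, b, len, [] __device__(float x, float y) { return x + y; }, stream);
}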
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N,
MainLambda main_op = raft::Nop<InType, IdxType>(),
ReduceLambda reduce_op = raft::Sum<OutType>(),
FinalLambda final_op = raft::Nop<OutType>()) {
detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op)
detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
}

}; // end namespace linalg
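Similarly, coalescedReduction keeps its public signature and forwards to the detail namespace (the diff also adds the semicolon missing from the forwarding call). A minimal usage sketch, assuming the public header is coalesced_reduction.hpp and that D is the length of each coalesced (leading-dimension) reduction while N is the number of reductions, per the existing raft convention:

#include <raft/linalg/coalesced_reduction.hpp>  // assumed public header name

// Sum every length-D contiguous row of `data` into dots[0..N); the default
// main/reduce/final ops give a plain sum starting from init = 0.
void row_sums(float* dots, const float* data, int D, int N, cudaStream_t stream)
{
  raft::linalg::coalescedReduction(dots, data, D, N, 0.0f, stream, false);
}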
File renamed without changes.
2 changes: 2 additions & 0 deletions cpp/include/raft/linalg/detail/add.cuh
Original file line number Diff line number Diff line change
@@ -16,6 +16,8 @@

#pragma once

#include <raft/cuda_utils.cuh>

namespace raft {
namespace linalg {
namespace detail {
35 changes: 35 additions & 0 deletions cpp/include/raft/linalg/detail/binary_op.cuh
Original file line number Diff line number Diff line change
@@ -61,6 +61,41 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
}

template <typename InType, typename Lambda, typename OutType = InType,
typename IdxType = int, int TPB = 256>
void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
Lambda op, cudaStream_t stream) {
constexpr auto maxSize =
sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
size_t bytes = len * maxSize;
uint64_t in1Addr = uint64_t(in1);
uint64_t in2Addr = uint64_t(in2);
uint64_t outAddr = uint64_t(out);
if (16 / maxSize && bytes % 16 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 16)) {
binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (8 / maxSize && bytes % 8 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 8)) {
binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (4 / maxSize && bytes % 4 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 4)) {
binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (2 / maxSize && bytes % 2 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 2)) {
binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (1 / maxSize) {
binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else {
binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
op, stream);
}
}

} // namespace detail
} // namespace linalg
} // namespace raft
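To make the dispatch above concrete with a worked example (not part of the diff): for float inputs and float outputs, maxSize is 4 bytes, so when the byte count is a multiple of 16 and all three pointers are 16-byte aligned, binaryOpImpl is instantiated with a vector width of 16 / 4 = 4 elements per thread; buffers that fail the size or alignment checks fall through to narrower widths (2, then 1), and element types wider than 16 bytes take the final single-element branch.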
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@
#include <raft/linalg/cusolver_wrappers.h>
#include <raft/cuda_utils.cuh>
#include <raft/handle.hpp>
#include <raft/linalg/binary_op.cuh>
#include <raft/linalg/binary_op.hpp>

namespace raft {
namespace linalg {
48 changes: 44 additions & 4 deletions cpp/include/raft/linalg/detail/eig.hpp
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@
#include <raft/linalg/cusolver_wrappers.h>
#include <raft/cuda_utils.cuh>
#include <raft/handle.hpp>
#include <raft/matrix/matrix.cuh>
#include <raft/matrix/matrix.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

@@ -30,9 +30,9 @@ namespace linalg {
namespace detail {

template <typename math_t>
void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
int n_cols, math_t *eig_vectors, math_t *eig_vals,
cudaStream_t stream) {
void eigDC_legacy(const raft::handle_t &handle, const math_t *in,
std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors,
math_t *eig_vals, cudaStream_t stream) {
cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();

int lwork;
@@ -51,10 +51,50 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
d_dev_info.data(), stream));
CUDA_CHECK(cudaGetLastError());

auto dev_info = d_dev_info.value(stream);
ASSERT(dev_info == 0,
"eig.cuh: eigensolver couldn't converge to a solution. "
"This usually occurs when some of the features do not vary enough.");
}

template <typename math_t>
void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals,
cudaStream_t stream) {
#if CUDART_VERSION < 11010
eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
#else
cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();

cusolverDnParams_t dn_params = nullptr;
CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params));

size_t workspaceDevice = 0;
size_t workspaceHost = 0;
CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(
cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
eig_vals, &workspaceDevice, &workspaceHost, stream));

rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream);
rmm::device_scalar<int> d_dev_info(stream);
std::vector<math_t> h_work(workspaceHost / sizeof(math_t));

raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);

CUSOLVER_CHECK(cusolverDnxsyevd(
cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
eig_vals, d_work.data(), workspaceDevice, h_work.data(), workspaceHost,
d_dev_info.data(), stream));

CUDA_CHECK(cudaGetLastError());
CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params));
int dev_info = d_dev_info.value(stream);
ASSERT(dev_info == 0,
"eig.cuh: eigensolver couldn't converge to a solution. "
"This usually occurs when some of the features do not vary enough.");
#endif
}

enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
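The new eigDC dispatches on CUDART_VERSION: toolkits older than 11.1 keep the legacy path (eigDC_legacy), while newer ones go through raft's cusolverDnxsyevd wrappers and cusolverDnParams. A minimal caller sketch, assuming the public raft::linalg::eigDC wrapper mirrors the detail signature shown above and that the public header is eig.hpp:

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/linalg/eig.hpp>  // assumed public header for the eigDC wrapper

// Symmetric eigendecomposition of an n x n matrix `in` (e.g. a covariance matrix).
void eigen_decompose(const raft::handle_t& handle, const float* in, std::size_t n,
                     float* eig_vectors, float* eig_vals, cudaStream_t stream)
{
  // On CUDA < 11.1 this routes to eigDC_legacy; otherwise to the cusolverDnxsyevd path.
  raft::linalg::eigDC(handle, in, n, n, eig_vectors, eig_vals, stream);
}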
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,25 +23,8 @@

namespace raft {
namespace linalg {
namespace detail {

/**
* @brief the wrapper of cublas gemm function
* It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C
* @tparam math_t the type of input/output matrices
* @param handle raft handle
* @param a input matrix
* @param n_rows_a number of rows of A
* @param n_cols_a number of columns of A
* @param b input matrix
* @param c output matrix
* @param n_rows_c number of rows of C
* @param n_cols_c number of columns of C
* @param trans_a cublas transpose op for A
* @param trans_b cublas transpose op for B
* @param alpha scalar
* @param beta scalar
* @param stream cuda stream
*/
template <typename math_t>
void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
@@ -59,36 +42,6 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
b, ldb, &beta, c, ldc, stream));
}

template <typename math_t>
void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
cublasOperation_t trans_a, cublasOperation_t trans_b,
cudaStream_t stream) {
math_t alpha = math_t(1);
math_t beta = math_t(0);
gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
trans_b, alpha, beta, stream);
}

/**
* @brief A wrapper for CUBLS GEMM function designed for handling all possible
* combinations of operand layouts.
* It computes the following equation: Z = alpha . X * Y + beta . Z
* @tparam T Data type of input/output matrices (float/double)
* @param handle raft handle
* @param z output matrix of size M rows x N columns
* @param x input matrix of size M rows x K columns
* @param y input matrix of size K rows x N columns
* @param _M number of rows of X and Z
* @param _N number of rows of Y and columns of Z
* @param _K number of columns of X and rows of Y
* @param isZColMajor Storage layout of Z. true = col major, false = row major
* @param isXColMajor Storage layout of X. true = col major, false = row major
* @param isYColMajor Storage layout of Y. true = col major, false = row major
* @param stream cuda stream
* @param alpha scalar
* @param beta scalar
*/
template <typename T>
void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
@@ -166,5 +119,6 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
b, ldb, &beta, c, ldc, stream));
}

} // end namespace linalg
} // end namespace raft
} // namespace detail
} // namespace linalg
} // namespace raft
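The doxygen blocks removed here documented the cublas gemm wrappers, which this commit makes detail-only. For reference, a minimal sketch of the documented overload as a caller would use it through the public namespace, assuming the public wrapper forwards to the detail overload shown above; the header name and the column-major layout of the buffers are assumptions:

#include <cublas_v2.h>
#include <raft/handle.hpp>
#include <raft/linalg/gemm.hpp>  // assumed public header for the wrapper

// c = alpha * a * b + beta * c, with a (m x k), b (k x n), c (m x n), column-major.
void matmul(const raft::handle_t& handle, const float* a, const float* b, float* c,
            int m, int k, int n, cudaStream_t stream)
{
  const float alpha = 1.0f, beta = 0.0f;
  raft::linalg::gemm(handle, a, m, k, b, c, m, n,
                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
}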
You are viewing a condensed version of this merge commit.