From b48e32ccb334dbebf80f26fff8108c12ee7cf710 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 7 Jan 2022 10:21:42 -0800 Subject: [PATCH 1/3] public apis for all of matrix and stats --- cpp/include/raft/matrix/detail/math.cuh | 339 ++++++++++++++++++ cpp/include/raft/matrix/detail/matrix.cuh | 109 ++++++ cpp/include/raft/matrix/math.hpp | 295 ++++++--------- cpp/include/raft/matrix/matrix.hpp | 79 +--- cpp/include/raft/stats/detail/mean_center.cuh | 101 ++++++ cpp/include/raft/stats/mean_center.hpp | 26 +- 6 files changed, 677 insertions(+), 272 deletions(-) create mode 100644 cpp/include/raft/stats/detail/mean_center.cuh diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 5e194b8dd4..95103ab98e 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -16,13 +16,352 @@ #pragma once +#include + #include #include +#include +#include +#include +#include +#include +#include namespace raft { namespace matrix { namespace detail { +template +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::binaryOp( + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); +} + +template +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ + power(inout, inout, scalar, len, stream); +} + +template +void power(math_t* inout, int len, cudaStream_t stream) +{ + math_t scalar = 1.0; + power(inout, scalar, len, stream); +} + +template +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ + math_t scalar = 1.0; + power(in, out, scalar, len, stream); +} + +template +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::unaryOp( + d_dest, + d_src, + len, + [=] __device__(math_t a) { + if (set_neg_zero) { + if (a < math_t(0)) { + return math_t(0); + } else { + return sqrt(a * scalar); + } + } else { + return sqrt(a * scalar); + } + }, + stream); +} + +template +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ + seqRoot(inout, inout, scalar, len, stream, set_neg_zero); +} + +template +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + seqRoot(in, out, scalar, len, stream); +} + +template +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + seqRoot(inout, inout, scalar, len, stream); +} + +template +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + raft::linalg::unaryOp( + out, + in, + len, + [=] __device__(math_t a) { + if (a <= thres && -a <= thres) { + return math_t(0); + } else { + return a; + } + }, + stream); +} + +template +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + setSmallValuesZero(inout, inout, len, stream, thres); +} + +template +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::unaryOp( + d_dest, + d_src, + len, + [=] __device__(math_t a) { return setzero && (abs(a) <= thres) ? 
math_t{0} : scalar / a; }, + stream); +} + +template +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + reciprocal(inout, inout, scalar, len, stream, setzero, thres); +} + +template +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + reciprocal(inout, scalar, len, stream); +} + +template +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + reciprocal(in, out, scalar, len, stream); +} + +template +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); +} + +template +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; + auto d_dest = dest; + + rmm::device_scalar d_sum(stream); + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; + raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); + raft::linalg::unaryOp( + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); +} + +template +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); +} + +template +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (b == Type(0)) + return a; + else + return a * b; + }, + stream); +} + +template +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); +} + +template +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ + if (return_zero) { + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return Type(0); + else + return a / b; + }, + stream); + } else { + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return a; + else + return a / b; + }, + stream); + } +} + +template +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); +} + +template +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + 
raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); +} + // Computes the argmax(d_in) column-wise in a DxN matrix template __global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index cf908c5e6d..f9cfffe64d 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -23,6 +23,14 @@ #include +#include +#include +#include +#include +#include +#include +#include + namespace raft { namespace matrix { namespace detail { @@ -56,6 +64,98 @@ void copyRows(const m_t* in, }); } +template +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; + auto d_q_trunc = out; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); +} + +template +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; + auto d_q_reversed = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); +} + +template +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; + auto d_q_reversed = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; + + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); +} + +template +void print(const m_t* in, + idx_t n_rows, + idx_t n_cols, + char h_separator = ' ', + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) +{ + std::vector h_matrix = std::vector(n_cols * n_rows); + raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); + + for (idx_t i = 0; i < n_rows; i++) { + for (idx_t j = 0; j < n_cols; j++) { + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? 
h_separator : v_separator); + } + } +} + +template +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ + for (idx_t i = 0; i < n_rows; i++) { + for (idx_t j = 0; j < n_cols; j++) { + printf("%1.4f ", in[j * n_rows + i]); + } + printf("\n"); + } +} + /** * @brief Kernel for copying a slice of a big matrix to a small matrix with a * size matches that slice @@ -173,6 +273,15 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) matrixDiagonalInverse<<>>(in, len); } +template +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ + cublasHandle_t cublasH = handle.get_cublas_handle(); + m_t normval = 0; + RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + return normval; +} + } // end namespace detail } // end namespace matrix } // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index 8639cdfb02..be1428e655 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -18,14 +18,6 @@ #include "detail/math.cuh" -#include -#include -#include -#include -#include -#include -#include - namespace raft { namespace matrix { @@ -45,16 +37,7 @@ namespace matrix { template void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) { - auto d_src = in; - auto d_dest = out; - - raft::linalg::binaryOp( - d_dest, - d_src, - d_src, - len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, - stream); + detail::power(in, out, scalar, len, stream); } /** @@ -67,7 +50,7 @@ void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) template void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) { - power(inout, inout, scalar, len, stream); + detail::power(inout, scalar, len, stream); } /** @@ -79,8 +62,7 @@ void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) template void power(math_t* inout, int len, cudaStream_t stream) { - math_t scalar = 1.0; - power(inout, scalar, len, stream); + detail::power(inout, len, stream); } /** @@ -94,8 +76,7 @@ void power(math_t* inout, int len, cudaStream_t stream) template void power(math_t* in, math_t* out, int len, cudaStream_t stream) { - math_t scalar = 1.0; - power(in, out, scalar, len, stream); + detail::power(in, out, len, stream); } /** @@ -117,25 +98,7 @@ void seqRoot(math_t* in, cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; - auto d_dest = out; - - raft::linalg::unaryOp( - d_dest, - d_src, - len, - [=] __device__(math_t a) { - if (set_neg_zero) { - if (a < math_t(0)) { - return math_t(0); - } else { - return sqrt(a * scalar); - } - } else { - return sqrt(a * scalar); - } - }, - stream); + detail::seqRoot(in, out, scalar, len, stream, set_neg_zero); } /** @@ -152,7 +115,7 @@ template void seqRoot( math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) { - seqRoot(inout, inout, scalar, len, stream, set_neg_zero); + detail::seqRoot(inout, scalar, len, stream, set_neg_zero); } /** @@ -167,33 +130,38 @@ void seqRoot( template void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) { - math_t scalar = 1.0; - seqRoot(in, out, scalar, len, stream); + detail::seqRoot(in, out, len, stream); } +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input 
matrix with in-place results
+ * @param len: number of elements of input matrix
+ * @param stream cuda stream
+ */
 template <typename math_t, typename IdxType = int>
 void seqRoot(math_t* inout, IdxType len, cudaStream_t stream)
 {
-  math_t scalar = 1.0;
-  seqRoot(inout, inout, scalar, len, stream);
+  detail::seqRoot(inout, len, stream);
 }
 
+/**
+ * @brief sets small values to zero based on a defined threshold
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param in: input matrix
+ * @param len: number of elements of input matrix
+ * @param stream cuda stream
+ * @param thres threshold to set values to zero
+ */
 template <typename math_t, typename IdxType = int>
 void setSmallValuesZero(
   math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
 {
-  raft::linalg::unaryOp(
-    out,
-    in,
-    len,
-    [=] __device__(math_t a) {
-      if (a <= thres && -a <= thres) {
-        return math_t(0);
-      } else {
-        return a;
-      }
-    },
-    stream);
+  detail::setSmallValuesZero(out, in, len, stream, thres);
 }
 
 /**
@@ -208,7 +176,7 @@ void setSmallValuesZero(
 template <typename math_t, typename IdxType = int>
 void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
 {
-  setSmallValuesZero(inout, inout, len, stream, thres);
+  detail::setSmallValuesZero(inout, len, stream, thres);
 }
 
 /**
@@ -233,27 +201,20 @@ void reciprocal(math_t* in,
                 bool setzero = false,
                 math_t thres = 1e-15)
 {
-  auto d_src  = in;
-  auto d_dest = out;
-
-  raft::linalg::unaryOp(
-    d_dest,
-    d_src,
-    len,
-    [=] __device__(math_t a) { return setzero && (abs(a) <= thres) ? math_t{0} : scalar / a; },
-    stream);
+  detail::reciprocal(in, out, scalar, len, stream, setzero, thres);
 }
 
 /**
  * @brief Reciprocal of every element in the input matrix
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
- * @param inout: input matrix and also the result is stored
+ * @param inout: input matrix with in-place results
  * @param scalar: every element is multiplied with scalar
  * @param len: number elements of input matrix
  * @param stream cuda stream
- * @param setzero: (default false) when true and |value|<thres, result = 0)
+ * @param setzero round the input down to zero if it is less than the threshold
+ * @param thres the threshold used to forcibly set inputs to zero
+ * @{
 */
 template <typename math_t, typename IdxType = int>
 void reciprocal(math_t* inout,
@@ -263,7 +224,7 @@ void reciprocal(math_t* inout,
                 bool setzero = false,
                 math_t thres = 1e-15)
 {
-  reciprocal(inout, inout, scalar, len, stream, setzero, thres);
+  detail::reciprocal(inout, scalar, len, stream, setzero, thres);
 }
 
 /**
@@ -277,8 +238,7 @@ void reciprocal(math_t* inout,
 template <typename math_t, typename IdxType = int>
 void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
 {
-  math_t scalar = 1.0;
-  reciprocal(inout, scalar, len, stream);
+  detail::reciprocal(inout, len, stream);
 }
 
 /**
@@ -293,15 +253,22 @@ void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
 template <typename math_t, typename IdxType = int>
 void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
 {
-  math_t scalar = 1.0;
-  reciprocal(in, out, scalar, len, stream);
+  detail::reciprocal(in, out, len, stream);
 }
 
+/**
+ * @brief set all values of the matrix to the given scalar
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @param out output matrix. The result is stored in the out matrix
+ * @param in input matrix
+ * @param scalar scalar value
+ * @param len number of elements of input matrix
+ * @param stream cuda stream
+ */
 template <typename math_t>
 void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
 {
-  raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
+  detail::setValue(out, in, scalar, len, stream);
 }
 
 /**
@@ -318,15 +285,7 @@ template <typename math_t, typename IdxType = int>
 void ratio(
   const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
 {
-  auto d_src  = src;
-  auto d_dest = dest;
-
-  rmm::device_scalar<math_t> d_sum(stream);
-  auto* d_sum_ptr = d_sum.data();
-  auto no_op      = [] __device__(math_t in) { return in; };
-  raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
-  raft::linalg::unaryOp(
-    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream);
+  detail::ratio(handle, src, dest, len, stream);
 }
 
 /** @} */
@@ -359,6 +318,16 @@ void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
   detail::signFlip(inout, n_rows, n_cols, stream);
 }
 
+/**
+ * @brief multiply each row or column of matrix with vector
+ * @param data input matrix, results are in-place
+ * @param vec input vector
+ * @param n_row number of rows of input matrix
+ * @param n_col number of columns of input matrix
+ * @param rowMajor whether matrix is row major
+ * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns
+ * @param stream cuda stream
+ */
 template <typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinaryMult(Type* data,
                             const Type* vec,
@@ -368,18 +337,20 @@ void matrixVectorBinaryMult(Type* data,
                             bool bcastAlongRows,
                             cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a * b; },
-    stream);
+  detail::matrixVectorBinaryMult<Type, IdxType, TPB>(
+    data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream);
 }
 
+/**
+ * @brief multiply each row or column of matrix with vector, skipping zeros in vector
+ * @param data input matrix, results are in-place
+ * @param vec input vector
+ * @param n_row number of rows of input matrix
+ * @param n_col number of columns of input matrix
+ * @param rowMajor whether matrix is row major
+ * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns
+ * @param stream cuda stream
+ */
 template <typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinaryMultSkipZero(Type* data,
                                     const Type* vec,
@@ -389,23 +360,20 @@ void matrixVectorBinaryMultSkipZero(Type* data,
                                     bool bcastAlongRows,
                                     cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) {
-      if (b == Type(0))
-        return a;
-      else
-        return a * b;
-    },
-    stream);
+  detail::matrixVectorBinaryMultSkipZero<Type, IdxType, TPB>(
+    data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream);
 }
 
+/**
+ * @brief divide each row or column of matrix with vector
+ * @param data input matrix, results are in-place
+ * @param vec input vector
+ * @param n_row number of rows of input matrix
+ * @param n_col number of columns of input matrix
+ * @param rowMajor whether matrix is row major
+ * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns
+ * @param stream cuda stream
+ */
 template <typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinaryDiv(Type* data,
                            const Type* vec,
@@ -415,18 +383,20 @@ void matrixVectorBinaryDiv(Type* data,
                            bool bcastAlongRows,
                            cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp(
-    data,
-    data,
-    
vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, - stream); + detail::matrixVectorBinaryDiv( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief divide each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryDivSkipZero(Type* data, const Type* vec, @@ -437,41 +407,20 @@ void matrixVectorBinaryDivSkipZero(Type* data, cudaStream_t stream, bool return_zero = false) { - if (return_zero) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) - return Type(0); - else - return a / b; - }, - stream); - } else { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) - return a; - else - return a / b; - }, - stream); - } + detail::matrixVectorBinaryDivSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); } +/** + * @brief add each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryAdd(Type* data, const Type* vec, @@ -481,18 +430,20 @@ void matrixVectorBinaryAdd(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, - stream); + detail::matrixVectorBinaryAdd( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief subtract each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinarySub(Type* data, const Type* vec, @@ -502,16 +453,8 @@ void matrixVectorBinarySub(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, - stream); + detail::matrixVectorBinarySub( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index a89c28ab80..841a00cb4c 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -18,14 +18,6 @@ #include "detail/matrix.cuh" -#include -#include -#include -#include -#include -#include -#include - namespace raft { namespace matrix { @@ -87,18 +79,7 @@ template void 
truncZeroOrigin( m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; - auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream); } /** @@ -112,23 +93,7 @@ void truncZeroOrigin( template void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; - auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + detail::colReverse(inout, n_rows, n_cols, stream); } /** @@ -142,24 +107,7 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) template void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; - auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; - - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + detail::rowReverse(inout, n_rows, n_cols, stream); } /** @@ -179,14 +127,7 @@ void print(const m_t* in, char v_separator = '\n', cudaStream_t stream = rmm::cuda_stream_default) { - std::vector h_matrix = std::vector(n_cols * n_rows); - raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); - - for (idx_t i = 0; i < n_rows; i++) { - for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? 
h_separator : v_separator);
-    }
-  }
+  detail::print(in, n_rows, n_cols, h_separator, v_separator, stream);
 }
 
 /**
@@ -198,12 +139,7 @@ void print(const m_t* in,
 template <typename m_t, typename idx_t = int>
 void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
 {
-  for (idx_t i = 0; i < n_rows; i++) {
-    for (idx_t j = 0; j < n_cols; j++) {
-      printf("%1.4f ", in[j * n_rows + i]);
-    }
-    printf("\n");
-  }
+  detail::printHost(in, n_rows, n_cols);
 }
 
 /**
@@ -284,10 +220,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
 template <typename m_t, typename idx_t = int>
 m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
 {
-  cublasHandle_t cublasH = handle.get_cublas_handle();
-  m_t normval            = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
-  return normval;
+  detail::getL2Norm(handle, in, size, stream);
 }
 
 };  // end namespace matrix
diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh
new file mode 100644
index 0000000000..1a4fc20c51
--- /dev/null
+++ b/cpp/include/raft/stats/detail/mean_center.cuh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+/**
+ * @brief Center the input matrix wrt its mean
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads per block of the cuda kernel launched
+ * @param out the output mean-centered matrix
+ * @param data input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param rowMajor whether input is row or col major
+ * @param bcastAlongRows whether to broadcast vector along rows or columns
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int, int TPB = 256>
+void meanCenter(Type* out,
+                const Type* data,
+                const Type* mu,
+                IdxType D,
+                IdxType N,
+                bool rowMajor,
+                bool bcastAlongRows,
+                cudaStream_t stream)
+{
+  raft::linalg::matrixVectorOp(
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; },
+    stream);
+}
+
+/**
+ * @brief Add the mean back to the input matrix
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads per block of the cuda kernel launched
+ * @param out the output mean-added matrix
+ * @param data input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param rowMajor whether input is row or col major
+ * @param bcastAlongRows whether to broadcast vector along rows or columns
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int, int TPB = 256>
+void meanAdd(Type* out,
+             const Type* data,
+             const Type* mu,
+             IdxType D,
+             IdxType N,
+             bool rowMajor,
+             bool bcastAlongRows,
+             cudaStream_t stream)
+{
+  raft::linalg::matrixVectorOp(
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; },
+    stream);
+}
+
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp
index c0ba24312b..406a0b5047 100644
--- a/cpp/include/raft/stats/mean_center.hpp
+++ b/cpp/include/raft/stats/mean_center.hpp
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include 
-#include 
-#include 
+#include "detail/mean_center.cuh"
 
 namespace raft {
 namespace stats {
@@ -47,16 +45,7 @@ void meanCenter(Type* out,
                 bool bcastAlongRows,
                 cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; },
-    stream);
+  detail::meanCenter(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
 }
 
 /**
@@ -83,16 +72,7 @@ void meanAdd(Type* out,
             bool bcastAlongRows,
             cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; },
-    stream);
+  detail::meanAdd(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
 }
 
 };  // end namespace stats

From 3224cc1d915b6a3557c702e8ae3c9dea45e9293a Mon Sep 17 00:00:00 2001
From: divyegala 
Date: Mon, 10 Jan 2022 17:34:44 -0800
Subject: [PATCH 2/3] correcting docs

---
 cpp/include/raft/matrix/math.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp
index be1428e655..619e20a702 100644
--- a/cpp/include/raft/matrix/math.hpp
+++ b/cpp/include/raft/matrix/math.hpp
@@ -396,6 +396,8 @@ void matrixVectorBinaryDiv(Type* data,
  * @param rowMajor whether matrix is row major
  * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns
  * @param stream cuda stream
+ * @param return_zero result is zero if true and the vector value is below the threshold;
+ * original value if false
 */
 template <typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinaryDivSkipZero(Type* data,

From bb652e7325e184ebcfbdfd7740a276115e5eac83 Mon Sep 17 00:00:00 2001
From: divyegala 
Date: Tue, 11 Jan 2022 10:50:23 -0800
Subject: [PATCH 3/3] add missing return type

---
 cpp/include/raft/matrix/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index 841a00cb4c..d3d98cb872 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -220,7 +220,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
 template <typename m_t, typename idx_t = int>
 m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
 {
-  detail::getL2Norm(handle, in, size, stream);
+  return detail::getL2Norm(handle, in, size, stream);
 }
 
 };  // end namespace matrix
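---

A minimal caller of the relocated matrix-math API, for reference. Only the two
raft::matrix calls below use signatures from this series; the scaffolding (RMM
stream, sample data, error handling elided) is illustrative, and the translation
unit must be compiled by nvcc with --extended-lambda because the detail layer
instantiates device lambdas:

#include <raft/cudart_utils.h>
#include <raft/matrix/math.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdio>
#include <vector>

int main()
{
  rmm::cuda_stream stream;  // RAII stream; stream.value() yields the cudaStream_t

  // Four sample values; expect 1/sqrt(x) for each after the two calls below.
  std::vector<float> h_in{1.f, 4.f, 9.f, 16.f};
  const int len = static_cast<int>(h_in.size());

  rmm::device_uvector<float> d_in(h_in.size(), stream.view());
  rmm::device_uvector<float> d_out(h_in.size(), stream.view());
  raft::update_device(d_in.data(), h_in.data(), h_in.size(), stream.value());

  // Out-of-place sqrt, then in-place reciprocal, through the stable public
  // header; both calls now forward to raft::matrix::detail.
  raft::matrix::seqRoot(d_in.data(), d_out.data(), len, stream.value());
  raft::matrix::reciprocal(d_out.data(), len, stream.value());

  std::vector<float> h_out(h_in.size());
  raft::update_host(h_out.data(), d_out.data(), h_out.size(), stream.value());
  stream.synchronize();

  for (float v : h_out) std::printf("%f\n", v);  // 1.0, 0.5, 0.3333, 0.25
  return 0;
}

Nothing in this caller changes with the move: it keeps including
raft/matrix/math.hpp and sees identical signatures, while the implementations
are now free to evolve under the detail namespace.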
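A similar sketch for the stats side, calling the relocated
raft::stats::meanCenter on a tiny column-major matrix. The toy data, the
host-precomputed per-column means, and the flag choices (rowMajor = false for
column-major storage, bcastAlongRows = true so that mu is indexed by column)
are assumptions for illustration only:

#include <raft/cudart_utils.h>
#include <raft/stats/mean_center.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

int main()
{
  rmm::cuda_stream stream;

  // D = 2 columns, N = 3 rows, stored column-major.
  const int D = 2, N = 3;
  std::vector<float> h_data{1.f, 2.f, 3.f, 10.f, 20.f, 30.f};
  std::vector<float> h_mu{2.f, 20.f};  // per-column means, precomputed on host

  rmm::device_uvector<float> d_data(h_data.size(), stream.view());
  rmm::device_uvector<float> d_mu(h_mu.size(), stream.view());
  rmm::device_uvector<float> d_out(h_data.size(), stream.view());
  raft::update_device(d_data.data(), h_data.data(), h_data.size(), stream.value());
  raft::update_device(d_mu.data(), h_mu.data(), h_mu.size(), stream.value());

  // Subtract each column's mean from that column; forwards to
  // raft::stats::detail::meanCenter introduced by this series.
  raft::stats::meanCenter(
    d_out.data(), d_data.data(), d_mu.data(), D, N, false, true, stream.value());

  stream.synchronize();
  return 0;
}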