diff --git a/CHANGELOG.md b/CHANGELOG.md index f7ea434693..b13ec307c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # RAFT 0.17.0 (Date TBD) ## New Features +- PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects ## Improvements - PR #73: Move DistanceType enum from cuML to RAFT diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3de222928..cbe96454f5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -235,8 +235,34 @@ if(BUILD_RAFT_TESTS) test/cudart_utils.cpp test/handle.cpp test/integer_utils.cpp + test/linalg/add.cu + test/linalg/binary_op.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector_op.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/reduce.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + test/matrix/math.cu + test/matrix/matrix.cu test/mr/device/buffer.cpp test/mr/host/buffer.cpp + test/random/rng.cu + test/random/rng_int.cu + test/random/sample_without_replacement.cu + test/stats/mean.cu + test/stats/mean_center.cu + test/stats/stddev.cu + test/stats/sum.cu test/test.cpp test/spectral_matrix.cu test/eigen_solvers.cu diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh new file mode 100644 index 0000000000..8d5b29f700 --- /dev/null +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { + +/** + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ +template +void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, + KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, + cudaStream_t stream) { + size_t worksize; + cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, + outVals, len, 0, sizeof(KeyT) * 8, stream); + workspace.resize(worksize, stream); + cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, + inVals, outVals, len, 0, sizeof(KeyT) * 8, + stream); +} + +} // namespace raft diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh new file mode 100644 index 0000000000..785794461e --- /dev/null +++ b/cpp/include/raft/common/scatter.cuh @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
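// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// raft::sortPairs (cub_wrappers.cuh above) sizes the temporary storage with a
// first cub call, grows the caller-provided workspace if needed, and then
// sorts. Assuming a device allocator `alloc`, a stream, and device arrays of
// `n` float keys and int values (all names here are hypothetical):
//
//   raft::mr::device::buffer<char> workspace(alloc, stream, 0);
//   raft::sortPairs(workspace, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
//                   n, stream);
//
// The same workspace can be reused across calls; it only grows when a sort
// needs more temporary storage than any previous one.
// ----------------------------------------------------------------------------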
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { + +template +__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, + IdxT len, Lambda op) { + typedef TxN_t DataVec; + typedef TxN_t IdxVec; + IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); + tid *= VecLen; + if (tid >= len) return; + IdxVec idxIn; + idxIn.load(idx, tid); + DataVec dataIn; +#pragma unroll + for (int i = 0; i < VecLen; ++i) { + auto inPos = idxIn.val.data[i]; + dataIn.val.data[i] = op(in[inPos], tid + i); + } + dataIn.store(out, tid); +} + +template +void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + Lambda op, cudaStream_t stream) { + const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); + scatterKernel + <<>>(out, in, idx, len, op); + CUDA_CHECK(cudaGetLastError()); +} + +/** + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + cudaStream_t stream, Lambda op = raft::Nop()) { + if (len <= 0) return; + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; + size_t bytes = len * MaxPerElem; + if (16 / MaxPerElem && bytes % 16 == 0) { + scatterImpl(out, in, idx, len, + op, stream); + } else if (8 / MaxPerElem && bytes % 8 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (4 / MaxPerElem && bytes % 4 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (2 / MaxPerElem && bytes % 2 == 0) { + scatterImpl(out, in, idx, len, op, + stream); + } else if (1 / MaxPerElem) { + scatterImpl(out, in, idx, len, op, + stream); + } else { + scatterImpl(out, in, idx, len, op, stream); + } +} + +} // namespace raft diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh new file mode 100644 index 0000000000..696b3ec662 --- /dev/null +++ b/cpp/include/raft/cuda_utils.cuh @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
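// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// raft::scatter (scatter.cuh above) computes out[i] = op(in[idx[i]], i), with
// the vector width chosen from the buffer size. A plain permutation, and a
// variant that transforms while scattering (hypothetical device pointers):
//
//   raft::scatter<float, int>(d_out, d_in, d_idx, n, stream);
//   raft::scatter<float, int>(
//     d_out, d_in, d_idx, n, stream,
//     [] __device__(float v, int i) { return 2.0f * v; });
// ----------------------------------------------------------------------------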
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#ifndef ENABLE_MEMCPY_ASYNC +// enable memcpy_async interface by default for newer GPUs +#if __CUDA_ARCH__ >= 800 +#define ENABLE_MEMCPY_ASYNC 1 +#endif +#else // ENABLE_MEMCPY_ASYNC +// disable memcpy_async for all older GPUs +#if __CUDA_ARCH__ < 800 +#define ENABLE_MEMCPY_ASYNC 0 +#endif +#endif // ENABLE_MEMCPY_ASYNC + +namespace raft { + +/** helper macro for device inlined functions */ +#define DI inline __device__ +#define HDI inline __host__ __device__ +#define HD __host__ __device__ + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType ceildiv(IntType a, IntType b) { + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignTo(IntType a, IntType b) { + return ceildiv(a, b) * b; +} + +/** + * @brief Provide an alignment function ie. (a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignDown(IntType a, IntType b) { + return (a / b) * b; +} + +/** + * @brief Check if the input is a power of 2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI bool isPo2(IntType num) { + return (num && !(num & (num - 1))); +} + +/** + * @brief Give logarithm of the number to base-2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { + return num <= IntType(1) ? 
ret : log2(num >> IntType(1), ++ret); +} + +/** Device function to apply the input lambda across threads in the grid */ +template +DI void forEach(int num, L lambda) { + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + const int numThreads = blockDim.x * gridDim.x; +#pragma unroll + for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { + if (idx < num) lambda(idx, itr); + } +} + +/** number of threads per warp */ +static const int WarpSize = 32; + +/** get the laneId of the current thread */ +DI int laneId() { + int id; + asm("mov.s32 %0, %laneid;" : "=r"(id)); + return id; +} + +/** + * @brief Swap two values + * @tparam T the datatype of the values + * @param a first input + * @param b second input + */ +template +HDI void swapVals(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +/** Device function to have atomic add support for older archs */ +template +DI void myAtomicAdd(Type *address, Type val) { + atomicAdd(address, val); +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) +// Ref: +// http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf +template <> +DI void myAtomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +} +#endif + +template +DI void myAtomicReduce(T *address, T val, ReduceLambda op); + +template +DI void myAtomicReduce(double *address, double val, ReduceLambda op) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = + atomicCAS(address_as_ull, assumed, + __double_as_longlong(op(val, __longlong_as_double(assumed)))); + } while (assumed != old); +} + +template +DI void myAtomicReduce(float *address, float val, ReduceLambda op) { + unsigned int *address_as_uint = (unsigned int *)address; + unsigned int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS(address_as_uint, assumed, + __float_as_uint(op(val, __uint_as_float(assumed)))); + } while (assumed != old); +} + +template +DI void myAtomicReduce(int *address, int val, ReduceLambda op) { + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +template +DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { + long long old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +template +DI void myAtomicReduce(unsigned long long *address, unsigned long long val, + ReduceLambda op) { + unsigned long long old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, op(val, assumed)); + } while (assumed != old); +} + +/** + * @brief Provide atomic min operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMin(T *address, T val); + +/** + * @brief Provide atomic max operation. + * @tparam T: data type for input data (float or double). 
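// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// myAtomicReduce generalizes atomic read-modify-write to any binary op via an
// atomicCAS loop. For example, a grid-wide maximum (hypothetical kernel; seed
// *d_max with -INFINITY and launch raft::ceildiv(n, 256) blocks of 256):
//
//   __global__ void max_kernel(float *d_max, const float *x, int n) {
//     int i = threadIdx.x + blockIdx.x * blockDim.x;
//     if (i < n) raft::myAtomicReduce(d_max, x[i], fmaxf);
//   }
// ----------------------------------------------------------------------------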
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMax(T *address, T val); + +DI float myAtomicMin(float *address, float val) { + myAtomicReduce(address, val, fminf); + return *address; +} + +DI float myAtomicMax(float *address, float val) { + myAtomicReduce(address, val, fmaxf); + return *address; +} + +DI double myAtomicMin(double *address, double val) { + myAtomicReduce(address, val, fmin); + return *address; +} + +DI double myAtomicMax(double *address, double val) { + myAtomicReduce(address, val, fmax); + return *address; +} + +/** + * @defgroup Max maximum of two numbers + * @{ + */ +template +HDI T myMax(T x, T y); +template <> +HDI float myMax(float x, float y) { + return fmaxf(x, y); +} +template <> +HDI double myMax(double x, double y) { + return fmax(x, y); +} +/** @} */ + +/** + * @defgroup Min minimum of two numbers + * @{ + */ +template +HDI T myMin(T x, T y); +template <> +HDI float myMin(float x, float y) { + return fminf(x, y); +} +template <> +HDI double myMin(double x, double y) { + return fmin(x, y); +} +/** @} */ + +/** + * @brief Provide atomic min operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMin(T *address, T val) { + myAtomicReduce(address, val, myMin); + return *address; +} + +/** + * @brief Provide atomic max operation. + * @tparam T: data type for input data (float or double). + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] val: new value to compare with old + */ +template +DI T myAtomicMax(T *address, T val) { + myAtomicReduce(address, val, myMax); + return *address; +} + +/** + * Sign function + */ +template +HDI int sgn(const T val) { + return (T(0) < val) - (val < T(0)); +} + +/** + * @defgroup Exp Exponential function + * @{ + */ +template +HDI T myExp(T x); +template <> +HDI float myExp(float x) { + return expf(x); +} +template <> +HDI double myExp(double x) { + return exp(x); +} +/** @} */ + +/** + * @defgroup Cuda infinity values + * @{ + */ +template +inline __device__ T myInf(); +template <> +inline __device__ float myInf() { + return CUDART_INF_F; +} +template <> +inline __device__ double myInf() { + return CUDART_INF; +} +/** @} */ + +/** + * @defgroup Log Natural logarithm + * @{ + */ +template +HDI T myLog(T x); +template <> +HDI float myLog(float x) { + return logf(x); +} +template <> +HDI double myLog(double x) { + return log(x); +} +/** @} */ + +/** + * @defgroup Sqrt Square root + * @{ + */ +template +HDI T mySqrt(T x); +template <> +HDI float mySqrt(float x) { + return sqrtf(x); +} +template <> +HDI double mySqrt(double x) { + return sqrt(x); +} +/** @} */ + +/** + * @defgroup SineCosine Sine and cosine calculation + * @{ + */ +template +DI void mySinCos(T x, T &s, T &c); +template <> +DI void mySinCos(float x, float &s, float &c) { + sincosf(x, &s, &c); +} +template <> +DI void mySinCos(double x, double &s, double &c) { + sincos(x, &s, &c); +} +/** @} */ + +/** + * @defgroup Sine Sine calculation + * @{ + */ +template +DI T mySin(T x); +template <> +DI float mySin(float x) { + return sinf(x); +} +template <> +DI double mySin(double x) { + return sin(x); +} +/** @} */ + +/** + * @defgroup Abs Absolute value + * @{ + */ +template +DI T myAbs(T 
x) { + return x < 0 ? -x : x; +} +template <> +DI float myAbs(float x) { + return fabsf(x); +} +template <> +DI double myAbs(double x) { + return fabs(x); +} +/** @} */ + +/** + * @defgroup Pow Power function + * @{ + */ +template +HDI T myPow(T x, T power); +template <> +HDI float myPow(float x, float power) { + return powf(x, power); +} +template <> +HDI double myPow(double x, double power) { + return pow(x, power); +} +/** @} */ + +/** + * @defgroup myTanh tanh function + * @{ + */ +template +HDI T myTanh(T x); +template <> +HDI float myTanh(float x) { + return tanhf(x); +} +template <> +HDI double myTanh(double x) { + return tanh(x); +} +/** @} */ + +/** + * @defgroup myATanh arctanh function + * @{ + */ +template +HDI T myATanh(T x); +template <> +HDI float myATanh(float x) { + return atanhf(x); +} +template <> +HDI double myATanh(double x) { + return atanh(x); +} +/** @} */ + +/** + * @defgroup LambdaOps Lambda operations in reduction kernels + * @{ + */ +// IdxType mostly to be used for MainLambda in *Reduction kernels +template +struct Nop { + HDI Type operator()(Type in, IdxType i = 0) { return in; } +}; + +template +struct L1Op { + HDI Type operator()(Type in, IdxType i = 0) { return myAbs(in); } +}; + +template +struct L2Op { + HDI Type operator()(Type in, IdxType i = 0) { return in * in; } +}; + +template +struct Sum { + HDI Type operator()(Type a, Type b) { return a + b; } +}; +/** @} */ + +/** + * @defgroup Sign Obtain sign value + * @brief Obtain sign of x + * @param x input + * @return +1 if x >= 0 and -1 otherwise + * @{ + */ +template +DI T signPrim(T x) { + return x < 0 ? -1 : +1; +} +template <> +DI float signPrim(float x) { + return signbit(x) == true ? -1.0f : +1.0f; +} +template <> +DI double signPrim(double x) { + return signbit(x) == true ? -1.0 : +1.0; +} +/** @} */ + +/** + * @defgroup Max maximum of two numbers + * @brief Obtain maximum of two values + * @param x one item + * @param y second item + * @return maximum of two items + * @{ + */ +template +DI T maxPrim(T x, T y) { + return x > y ? 
x : y; +} +template <> +DI float maxPrim(float x, float y) { + return fmaxf(x, y); +} +template <> +DI double maxPrim(double x, double y) { + return fmax(x, y); +} +/** @} */ + +/** apply a warp-wide fence (useful from Volta+ archs) */ +DI void warpFence() { +#if __CUDA_ARCH__ >= 700 + __syncwarp(); +#endif +} + +/** warp-wide any boolean aggregator */ +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + inFlag = __any_sync(mask, inFlag); +#else + inFlag = __any(inFlag); +#endif + return inFlag; +} + +/** warp-wide all boolean aggregator */ +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + inFlag = __all_sync(mask, inFlag); +#else + inFlag = __all(inFlag); +#endif + return inFlag; +} + +/** + * @brief Shuffle the data inside a warp + * @tparam T the data type (currently assumed to be 4B) + * @param val value to be shuffled + * @param srcLane lane from where to shuffle + * @param width lane width + * @param mask mask of participating threads (Volta+) + * @return the shuffled data + */ +template +DI T shfl(T val, int srcLane, int width = WarpSize, + uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + return __shfl_sync(mask, val, srcLane, width); +#else + return __shfl(val, srcLane, width); +#endif +} + +/** + * @brief Shuffle the data inside a warp + * @tparam T the data type (currently assumed to be 4B) + * @param val value to be shuffled + * @param laneMask mask to be applied in order to perform xor shuffle + * @param width lane width + * @param mask mask of participating threads (Volta+) + * @return the shuffled data + */ +template +DI T shfl_xor(T val, int laneMask, int width = WarpSize, + uint32_t mask = 0xffffffffu) { +#if CUDART_VERSION >= 9000 + return __shfl_xor_sync(mask, val, laneMask, width); +#else + return __shfl_xor(val, laneMask, width); +#endif +} + +/** + * @brief Warp-level sum reduction + * @param val input value + * @return only the lane0 will contain valid reduced result + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. All threads in the warp must enter this + * function together + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T warpReduce(T val) { +#pragma unroll + for (int i = WarpSize / 2; i > 0; i >>= 1) { + T tmp = shfl(val, laneId() + i); + val += tmp; + } + return val; +} + +/** + * @brief 1-D block-level sum reduction + * @param val input value + * @param smem shared memory region needed for storing intermediate results. It + * must alteast be of size: `sizeof(T) * nWarps` + * @return only the thread0 will contain valid reduced result + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. All threads in the block must enter this + * function together + * @todo Expand this to support arbitrary reduction ops + */ +template +DI T blockReduce(T val, char *smem) { + auto *sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); + if (lid == 0) sTemp[wid] = val; + __syncthreads(); + val = lid < nWarps ? sTemp[lid] : T(0); + return warpReduce(val); +} + +/** + * @brief Simple utility function to determine whether user_stream or one of the + * internal streams should be used. 
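// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// blockReduce needs sizeof(T) bytes of shared memory per warp and returns the
// valid sum only on thread 0, as documented above. Typical use (hypothetical):
//
//   template <int TPB>
//   __global__ void sum_kernel(const float *x, float *block_sums, int n) {
//     __shared__ char smem[sizeof(float) * (TPB / raft::WarpSize)];
//     int i = threadIdx.x + blockIdx.x * TPB;
//     float v = i < n ? x[i] : 0.0f;
//     float s = raft::blockReduce(v, smem);
//     if (threadIdx.x == 0) block_sums[blockIdx.x] = s;
//   }
// ----------------------------------------------------------------------------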
+ * @param user_stream main user stream + * @param int_streams array of internal streams + * @param n_int_streams number of internal streams + * @param idx the index for which to query the stream + */ +inline cudaStream_t select_stream(cudaStream_t user_stream, + cudaStream_t *int_streams, int n_int_streams, + int idx) { + return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; +} + +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b4713b9d53..86c60addf2 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -21,7 +21,10 @@ #include #include +#include #include +#include +#include ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" @@ -256,4 +259,107 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ +/** cuda malloc */ +template +void allocate(Type*& ptr, size_t len, bool setZero = false) { + CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); + if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +} + +/** helper method to get max usable shared mem per block parameter */ +inline int getSharedMemPerBlock() { + int devId; + CUDA_CHECK(cudaGetDevice(&devId)); + int smemPerBlk; + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, + cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} + +/** helper method to get multi-processor count parameter */ +inline int getMultiProcessorCount() { + int devId; + CUDA_CHECK(cudaGetDevice(&devId)); + int mpCount; + CUDA_CHECK( + cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; +} + +/** helper method to convert an array on device to a string on host */ +template +std::string arr2Str(const T* arr, int size, std::string name, + cudaStream_t stream, int width = 4) { + std::stringstream ss; + + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; + + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; + + free(arr_h); + + return ss.str(); +} + +/** this seems to be unused, but may be useful in the future */ +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name + << ". device=" << s_att.device << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration) + .count(); +} + +/** Helper function to calculate need memory for allocate to store dense matrix. + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { + return rows * columns; +} + +/** Helper function to check alignment of pointer. 
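// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// The new cudart_utils.h helpers combine naturally when debugging
// (hypothetical pointer and stream):
//
//   float *d_x;
//   raft::allocate(d_x, 16, true);  // cudaMalloc + zero-fill
//   // ... enqueue work on `stream` ...
//   std::cout << raft::arr2Str(d_x, 16, "x", stream);
//   CUDA_CHECK(cudaFree(d_x));
//
// arr2Str copies to host and synchronizes the stream internally, so it is
// intended for debugging rather than hot paths.
// ----------------------------------------------------------------------------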
+ * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ +template +bool is_aligned(Type* ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +/** calculate greatest common divisor of two numbers +* @a integer +* @b integer +* @ return gcd of a and b +*/ +template +IntType gcd(IntType a, IntType b) { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} + } // namespace raft diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh new file mode 100644 index 0000000000..7a454f64e2 --- /dev/null +++ b/cpp/include/raft/linalg/add.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Elementwise scalar add operation on the input buffer + * + * @tparam InT input data-type. Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + */ +template +void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { + auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; + unaryOp(out, in, len, op, stream); +} + +/** + * @brief Elementwise add operation on the input buffers + * @tparam InT input data-type. 
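// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of this patch.]
// is_aligned and gcd are small host-side helpers, e.g.:
//
//   raft::is_aligned(d_ptr, 16);  // true if d_ptr is 16-byte aligned
//   raft::gcd(12, 18);            // == 6
// ----------------------------------------------------------------------------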
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void add(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { + auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; + binaryOp(out, in1, in2, len, op, stream); +} + +template +__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { + outDev[i] = inDev[i] + *singleScalarDev; + } +} + +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + */ +template +void addDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // TODO: block dimension has not been tuned + dim3 block(256); + dim3 grid(raft::ceildiv(len, (IdxType)block.x)); + add_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh new file mode 100644 index 0000000000..f8142d9a82 --- /dev/null +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, + IdxType len, Lambda op, cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template +void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, + Lambda op, cudaStream_t stream) { + constexpr auto maxSize = + sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + if (16 / maxSize && bytes % 16 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (1 / maxSize) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else { + binaryOpImpl(out, in1, in2, len, + op, stream); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh new file mode 100644 index 0000000000..ef983ff3d0 --- /dev/null +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension +// of the matrix, i.e. reduce along rows for row major or reduce along columns +// for column major layout. Kernel does an inplace reduction adding to original +// values of dots. 
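// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// The element-wise launchers in add.cuh and binary_op.cuh compose with device
// lambdas; e.g. a fused a*x + y over n floats (hypothetical names):
//
//   raft::linalg::binaryOp(
//     d_out, d_x, d_y, n,
//     [a] __device__(float x, float y) { return a * x + y; }, stream);
//
// binaryOp picks the widest vector width (16B down to scalar) that divides
// the buffer size, as its implementation above shows.
// ----------------------------------------------------------------------------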
+template +__global__ void coalescedReductionKernel(OutType *dots, const InType *data, + int D, int N, OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); + } else { + dots[blockIdx.x] = final_op(acc); + } + } +} + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType *dots, const InType *data, int D, int N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + // One block per reduction + // Efficient only for large leading dimensions + if (D <= 32) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 64) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else if (D <= 128) { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } else { + coalescedReductionKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh new file mode 100644 index 0000000000..c848ac1f4b --- /dev/null +++ b/cpp/include/raft/linalg/divide.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh new file mode 100644 index 0000000000..6172618380 --- /dev/null +++ b/cpp/include/raft/linalg/eig.cuh @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
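// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// coalescedReduction reduces along the coalesced dimension, i.e. the rows of
// a row-major matrix. Row-wise L2 norms of an N x D row-major float matrix
// (hypothetical names):
//
//   raft::linalg::coalescedReduction(
//     d_norms, d_mat, D, N, 0.0f, stream, false /* inplace */,
//     raft::L2Op<float>(), raft::Sum<float>(),
//     [] __device__(float v) { return raft::mySqrt(v); });
// ----------------------------------------------------------------------------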
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + * @{ + */ +template +void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, in, + n_cols, eig_vals, &lwork)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer d_dev_info(allocator, stream, 1); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. " + "This usually occurs when some of the features do not vary enough."); +} + +enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; + +#if CUDART_VERSION >= 10010 + +/** + * @defgroup eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. 
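// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// eigDC above computes all eigenpairs of a column-major symmetric matrix; the
// input is first copied into eig_vectors, so it is left untouched, and the
// call synchronizes the stream to validate cusolver's devInfo. For an n x n
// covariance matrix (hypothetical buffers):
//
//   raft::linalg::eigDC(handle, d_cov, n, n, d_eig_vectors, d_eig_vals,
//                       stream);
// ----------------------------------------------------------------------------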
+ * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param n_eig_vals: number of eigenvectors to be generated + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + * @{ + */ +template +void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, + EigVecMemUsage memUsage, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int lwork; + int h_meig; + + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer d_dev_info(allocator, stream, 1); + raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); + + if (memUsage == OVERWRITE_INPUT) { + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); + } else if (memUsage == COPY_INPUT) { + d_eig_vectors.resize(n_rows * n_cols, stream); + raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), + math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, + d_work.data(), lwork, d_dev_info.data(), stream)); + } + + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "eig.cuh: eigensolver couldn't converge to a solution. " + "This usually occurs when some of the features do not vary enough."); + + if (memUsage == OVERWRITE_INPUT) { + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, + stream); + } else if (memUsage == COPY_INPUT) { + raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, + n_rows, n_eig_vals, stream); + } +} + +#endif + +/** + * @defgroup overloaded function for eig decomp with Jacobi method for the + * column-major symmetric matrices (in parameter) + * @param handle: raft handle + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. 
+ * @{ + */ +template +void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + syevjInfo_t syevj_params = nullptr; + CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); + CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); + CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); + + int lwork; + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, + eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + raft::mr::device::buffer dev_info(allocator, stream, 1); + + raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); + + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + dev_info.data(), syevj_params, stream)); + + int executed_sweeps; + CUSOLVER_CHECK( + cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + + CUDA_CHECK(cudaGetLastError()); + CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh new file mode 100644 index 0000000000..a46d550220 --- /dev/null +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
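// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// eigJacobi trades accuracy for speed through its tol/sweeps parameters; a
// looser, faster decomposition might look like (hypothetical buffers):
//
//   raft::linalg::eigJacobi(handle, d_cov, n, n, d_eig_vectors, d_eig_vals,
//                           stream, 1e-5f, 10 /* sweeps */);
// ----------------------------------------------------------------------------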
+ */ + +#pragma once + +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void scalarAdd(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in + scalar; }, + stream); +} + +template +void scalarMultiply(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, + stream); +} +/** @} */ + +/** + * @defgroup BinaryOps Element-wise binary operations on the input buffers + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + * @{ + */ +template +void eltwiseAdd(math_t *out, const math_t *in1, const math_t *in2, IdxType len, + cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a + b; }, + stream); +} + +template +void eltwiseSub(math_t *out, const math_t *in1, const math_t *in2, IdxType len, + cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a - b; }, + stream); +} + +template +void eltwiseMultiply(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a * b; }, + stream); +} + +template +void eltwiseDivide(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return a / b; }, + stream); +} + +template +void eltwiseDivideCheckZero(math_t *out, const math_t *in1, const math_t *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, + [] __device__(math_t a, math_t b) { + if (b == math_t(0.0)) + return math_t(0.0); + else + return a / b; + }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh new file mode 100644 index 0000000000..0a4897cc0b --- /dev/null +++ b/cpp/include/raft/linalg/gemm.cuh @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
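// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch, not part of this patch.]
// The eltwise helpers cover the common arithmetic cases without writing a
// lambda (hypothetical names):
//
//   raft::linalg::eltwiseMultiply(d_out, d_a, d_b, n, stream);
//   raft::linalg::eltwiseDivideCheckZero(d_ratio, d_num, d_den, n, stream);
//
// eltwiseDivideCheckZero yields 0 wherever the denominator is 0, e.g. when
// normalizing by possibly-empty cluster counts.
// ----------------------------------------------------------------------------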
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param alpha scalar + * @param beta scalar + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, + math_t beta, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int lda = trans_a == CUBLAS_OP_T ? k : m; + int ldb = trans_b == CUBLAS_OP_T ? n : k; + int ldc = m; + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); +} + +template +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, + cudaStream_t stream) { + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, + trans_b, alpha, beta, stream); +} + +/** + * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * combinations of operand layouts. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * @tparam T Data type of input/output matrices (float/double) + * @param handle raft handle + * @param z output matrix of size M rows x N columns + * @param x input matrix of size M rows x K columns + * @param y input matrix of size K rows x N columns + * @param _M number of rows of X and Z + * @param _N number of rows of Y and columns of Z + * @param _K number of columns of X and rows of Y + * @param isZColMajor Storage layout of Z. true = col major, false = row major + * @param isXColMajor Storage layout of X. true = col major, false = row major + * @param isYColMajor Storage layout of Y. true = col major, false = row major + * @param stream cuda stream + * @param alpha scalar + * @param beta scalar + */ +template +void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, + int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, + cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + cublasOperation_t trans_a, trans_b; + T *a, *b, *c; + int lda, ldb, ldc; + int M, N, K; + // This function performs c = a * b. Based on the required output layout, + // either a = x, b = y or a = y, b = x. In either case c = z. + if (isZColMajor == true) { + // Result c is required in column major layout. Thus we perform, + // z = x * y + // Using BLAS call c = a * b. Therefore a = x, b = y and c = z + + a = x; + // If x is in row major layout, cublas needs to transpose x first, + // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major + // layout, trans_b needs to be CUBLAS_OP_N. 
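// ----------------------------------------------------------------------------
// [Editor's note: illustrative usage sketch for the surrounding function, not
// part of this patch.]
// This layout-aware overload picks operands so that one column-major cuBLAS
// GEMM call serves any combination of row-/column-major X, Y and Z. For
// example, multiplying a row-major M x K matrix by a row-major K x N matrix
// into a row-major Z (hypothetical buffers):
//
//   raft::linalg::gemm(handle, d_z, d_x, d_y, M, N, K,
//                      false /* Z row-major */, false /* X row-major */,
//                      false /* Y row-major */, stream);
// ----------------------------------------------------------------------------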
+ trans_a = isXColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; + // Set leading dimension appropriately + lda = isXColMajor == true ? _M : _K; + + b = y; + // If y is in row major layout, cublas needs to transpose y first, + // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major + // layout, trans_b needs to be CUBLAS_OP_N. + trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; + ldb = isYColMajor == true ? _K : _N; + + c = z; + ldc = _M; + M = _M; + N = _N; + K = _K; + } else { + // Result c is required in row major layout Thus we pick + // a = y, b = x and c = a * b = y * x + // cublas produces output matrix only in column major layout. To get output + // matrix on row major layout, we need to produce transpose of output + // in column major layout. Therefore we perform, + // tr(z) = tr(y) * tr(x) + // we model this using cublas call for c = a * b + // therefore a = tr(y), b = tr(x) and c = tr(z) + + a = y; + // If y is in row major layout, it can be/ interpreted as tr(y) on column + // major layout. Therefore we can pass trans_a as CUBLAS_OP_N. If y is in + // column major layout, cublas needs to transpose y first, therefore + // trans_a needs to be CUBLAS_OP_T + trans_a = isYColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; + // Set leading dimension appropriately + lda = isYColMajor == true ? _K : _N; + + b = x; + // If x is in row major layout, it can be interpreted as tr(x) on column + // major layout. Therefore we can pass trans_b as CUBLAS_OP_N. If x is in + // column major layout, cublas needs to trasponse x first, therefore + // trans_b needs to be CUBLAS_OP_T + trans_b = isXColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; + // Set leading dimension appropriately + ldb = isXColMajor == true ? _M : _K; + + c = z; + ldc = _N; + + M = _N; + N = _M; + K = _K; + } + // Actual cuBLAS call + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h new file mode 100644 index 0000000000..edd18b3bee --- /dev/null +++ b/cpp/include/raft/linalg/gemv.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft { +namespace linalg { + +template +void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, + const math_t* x, int incx, math_t* y, int incy, bool trans_a, + math_t alpha, math_t beta, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + cublasOperation_t op_a = trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N;
+
+  // Unfortunately there is a clash of terminology: the BLAS convention
+  // (https://docs.nvidia.com/cuda/cublas/index.html) is the opposite of the
+  // Machine Learning one.
+  // In BLAS:
+  // m - number of rows in the input matrix
+  // n - number of columns in the input matrix
+  // lda - the leading dimension; it exists so that submatrices can be
+  // operated on without copying, and should simply be equal to m otherwise.
+  // lda describes the memory layout only; it has nothing to do with whether
+  // cuBLAS performs a transpose.
+
+  // In Machine Learning:
+  // m - number of columns in the design matrix (number of features)
+  // n - number of rows in the design matrix (number of training examples)
+
+  int m = n_rows;
+  int n = n_cols;
+  int lda = trans_a ? m : n;
+
+  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta,
+                          y, incy, stream));
+}
+
+template <typename math_t>
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha,
+          math_t beta, cudaStream_t stream) {
+  gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+}
+
+template <typename math_t>
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a,
+          cudaStream_t stream) {
+  math_t alpha = math_t(1);
+  math_t beta = math_t(0);
+
+  gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+}
+
+};  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
new file mode 100644
index 0000000000..1a6513b915
--- /dev/null
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace linalg {
+
+template <typename Type, int TPB>
+__device__ void reduce(Type *out, const Type acc) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type tmp = BlockReduce(temp_storage).Sum(acc);
+  if (threadIdx.x == 0) {
+    raft::myAtomicAdd(out, tmp);
+  }
+}
+
+template <typename Type, typename MapOp, int TPB, typename... Args>
+__global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
+                                       const Type *in, Args... args) {
+  Type acc = (Type)0;
+  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+
+  if (idx < len) {
+    acc = map(in[idx], args[idx]...);
+  }
+
+  __syncthreads();
+
+  reduce<Type, TPB>(out, acc);
+}
+
+template <typename Type, typename MapOp, int TPB, typename... Args>
+void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
+                          const Type *in, Args...
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
new file mode 100644
index 0000000000..902816418f
--- /dev/null
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace raft {
+namespace linalg {
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType>
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector, IdxType D, IdxType N,
+                                     bool rowMajor, bool bcastAlongRows,
+                                     Lambda op) {
+  typedef TxN_t<Type, veclen_> VecType;
+  IdxType len = N * D;
+  IdxType idx = threadIdx.x;
+  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
+  idx *= VecType::Ratio;
+  if (idx >= len) return;
+  IdxType vIdx;
+  VecType mat, vec;
+  ///@todo: yikes! use fast-int-div here.
+  ///@todo: shared mem for vector could help with perf
+  if (rowMajor && bcastAlongRows) {
+    vIdx = idx % D;
+    vec.load(vector, vIdx);
+  } else if (!rowMajor && !bcastAlongRows) {
+    vIdx = idx % N;
+    vec.load(vector, vIdx);
+  } else if (rowMajor && !bcastAlongRows) {
+    vIdx = idx / D;
+    vec.fill(vector[vIdx]);
+  } else {
+    vIdx = idx / N;
+    vec.fill(vector[vIdx]);
+  }
+  mat.load(matrix, idx);
+#pragma unroll
+  for (int i = 0; i < VecType::Ratio; ++i)
+    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
+  mat.store(out, idx);
+}
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
+                        IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  IdxType len = N * D;
+  IdxType nblks =
+    raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB);
+  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
+                                bcastAlongRows, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Apply an element-wise binary operation between each row (or column)
+ * of a matrix and a given vector.
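+ *
+ * @note when `bcastAlongRows` is true, `vec` must have D elements (one per
+ * column of the matrix); otherwise it must have N elements (one per row).
+ * This follows from the four indexing cases in the kernel above.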
+ * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec the vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, + cudaStream_t stream) { + IdxType stride = rowMajor ? D : N; + size_t bytes = stride * sizeof(Type); + if (16 / sizeof(Type) && bytes % 16 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (8 / sizeof(Type) && bytes % 8 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (4 / sizeof(Type) && bytes % 4 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (2 / sizeof(Type) && bytes % 2 == 0) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +///@todo: come up with a cleaner interface to support these cases in future! + +template +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector1, const Type *vector2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op) { + typedef TxN_t VecType; + IdxType len = N * D; + IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec1, vec2; + ///@todo: yikes! use fast-int-div here. + ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec1.load(vector1, vIdx); + vec2.load(vector2, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } else { + vIdx = idx / N; + vec1.fill(vector1[vIdx]); + vec2.fill(vector2[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); + mat.store(out, idx); +} + +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); + matrixVectorOpKernel + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, + bcastAlongRows, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Operations for all the columns or rows with the given vectors. 
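+ *
+ * @note as in the single-vector overload above, `vec1` and `vec2` must both
+ * have D elements when `bcastAlongRows` is true, and N elements otherwise.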
+ * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec1 the first vector + * @param vec2 the second vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType stride = rowMajor ? D : N; + size_t bytes = stride * sizeof(Type); + if (16 / sizeof(Type) && bytes % 16 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (8 / sizeof(Type) && bytes % 8 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (4 / sizeof(Type) && bytes % 4 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (2 / sizeof(Type) && bytes % 2 == 0) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else if (1 / sizeof(Type)) { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } else { + matrixVectorOpImpl( + out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh new file mode 100644 index 0000000000..9d1538c172 --- /dev/null +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "map_then_reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version mean squared error function mean((A-B)**2) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam TPB threads-per-block + * @param out the output mean squared error value (assumed to be a device pointer) + * @param A input array (assumed to be a device pointer) + * @param B input array (assumed to be a device pointer) + * @param len number of elements in the input arrays + * @param weight weight to apply to every term in the mean squared error calculation + * @param stream cuda-stream where to launch this kernel + */ +template +void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, + math_t weight, cudaStream_t stream) { + auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { + math_t diff = a - b; + return diff * diff * weight / len; + }; + mapThenSumReduce(out, len, sq_diff, stream, A, + B); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh new file mode 100644 index 0000000000..ce948c927d --- /dev/null +++ b/cpp/include/raft/linalg/multiply.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { + unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, + stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh new file mode 100644 index 0000000000..64930a7123 --- /dev/null +++ b/cpp/include/raft/linalg/norm.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +/** different types of norms supported on the input buffers */ +enum NormType { L1Norm = 0, L2Norm }; + +/** + * @brief Compute row-wise norm of the input matrix and perform fin_op lambda + * + * Row-wise norm is useful while computing pairwise distance matrix, for + * example. + * This is used in many clustering algos like knn, kmeans, dbscan, etc... The + * current implementation is optimized only for bigger values of 'D'. + * + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of row-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { + switch (type) { + case L1Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L1Op(), raft::Sum(), fin_op); + break; + case L2Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L2Op(), raft::Sum(), fin_op); + break; + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +/** + * @brief Compute column-wise norm of the input matrix and perform fin_op + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of column-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { + switch (type) { + case L1Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L1Op(), raft::Sum(), fin_op); + break; + case L2Norm: + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L2Op(), raft::Sum(), fin_op); + break; + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh new file mode 100644 index 0000000000..cafa8d54f1 --- /dev/null +++ b/cpp/include/raft/linalg/qr.cuh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup QRdecomp QR decomposition + * @{ + */ + +/** + * @brief compute QR decomp and return only Q matrix + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + * @{ + */ +template +void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + int k = min(m, n); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + raft::mr::device::buffer tau(allocator, stream, k); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); + + raft::mr::device::buffer devInfo(allocator, stream, 1); + int Lwork; + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); + raft::mr::device::buffer workspace(allocator, stream, Lwork); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); + /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. +#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + CUSOLVER_CHECK( + cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); +} + +/** + * @brief compute QR decomp and return both Q and R matrices + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param R: R matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + */ +template +void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + int m = n_rows, n = n_cols; + raft::mr::device::buffer R_full(allocator, stream, m * n); + raft::mr::device::buffer tau(allocator, stream, min(m, n)); + CUDA_CHECK( + cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + int R_full_nrows = m, R_full_ncols = n; + CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + + int Lwork; + raft::mr::device::buffer devInfo(allocator, stream, 1); + + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, + R_full_ncols, R_full.data(), + R_full_nrows, &Lwork)); + raft::mr::device::buffer workspace(allocator, stream, Lwork); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, + tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. 
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); + + CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); + int Q_nrows = m, Q_ncols = n; + + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, + min(Q_ncols, Q_nrows), Q, Q_nrows, + tau.data(), &Lwork)); + workspace.resize(Lwork, stream); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), + workspace.data(), Lwork, devInfo.data(), stream)); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh new file mode 100644 index 0000000000..d39577bbdd --- /dev/null +++ b/cpp/include/raft/linalg/reduce.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "coalesced_reduction.cuh" +#include "strided_reduction.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the requested dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
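+ *
+ * For example, the row-wise L2 norm in norm.cuh maps onto this primitive as
+ * main_op = raft::L2Op, reduce_op = raft::Sum, with an optional square root
+ * as final_op.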
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D number of columns + * @param N number of rows + * @param init initial value to use for the reduction + * @param rowMajor input matrix is row-major or not + * @param alongRows whether to reduce along rows or columns + * @param stream cuda stream where to launch work + * @param inplace reduction result added inplace or overwrites old values? + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void reduce(OutType *dots, const InType *data, int D, int N, OutType init, + bool rowMajor, bool alongRows, cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + if (rowMajor && alongRows) { + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); + } else if (rowMajor && !alongRows) { + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); + } else if (!rowMajor && alongRows) { + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); + } else { + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); + } +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh new file mode 100644 index 0000000000..fff09d2046 --- /dev/null +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. 
reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, + int N, Type init, MainLambda main_op) { + // Thread reduction + Type thread_data = Type(init); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + int rowStart = blockIdx.y * blockDim.y + threadIdx.y; + int stride = blockDim.y * gridDim.y; + for (int j = rowStart; j < N; j += stride) { + int idx = colStart + j * D; + thread_data += main_op(data[idx], j); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + Type *temp = (Type *)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicAdd(dots + colStart, temp[myidx]); +} + +// Kernel to perform reductions along the strided dimension +// of the matrix, i.e. reduce along columns for row major or reduce along rows +// for column major layout +template +__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, + int N, OutType init, MainLambda main_op, + ReduceLambda reduce_op) { + // Thread reduction + OutType thread_data = init; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + if (colStart < D) { + IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; + IdxType stride = blockDim.y * gridDim.y; + for (IdxType j = rowStart; j < N; j += stride) { + IdxType idx = colStart + j * D; + thread_data = reduce_op(thread_data, main_op(data[idx], j)); + } + } + + // Block reduction + extern __shared__ char tmp[]; // One element per thread in block + auto *temp = (OutType *)tmp; // Cast to desired type + IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); + temp[myidx] = thread_data; + __syncthreads(); + for (int j = blockDim.y / 2; j > 0; j /= 2) { + if (threadIdx.y < j) + temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + __syncthreads(); + } + + // Grid reduction + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); +} + +/** + * @brief Compute reduction of the input matrix along the strided dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
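+ *
+ * @note the grid-level step of this reduction uses atomics, which is why a
+ * non-inplace call first resets `dots` to `init` through a unaryOp (see the
+ * implementation below).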
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) { + ///@todo: this extra should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if (!inplace) + raft::linalg::unaryOp( + dots, dots, D, [init] __device__(OutType a) { return init; }, stream); + + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + if (std::is_same>::value && + std::is_same::value) + stridedSummationKernel + <<>>(dots, data, D, N, init, main_op); + else + stridedReductionKernel + <<>>(dots, data, D, N, init, main_op, + reduce_op); + + ///@todo: this complication should go away once we have eliminated the need + /// for atomics in stridedKernel (redesign for this is already underway) + // Perform final op on output data + if (!std::is_same>::value) + raft::linalg::unaryOp(dots, dots, D, final_op, stream); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh new file mode 100644 index 0000000000..882c105689 --- /dev/null +++ b/cpp/include/raft/linalg/subtract.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "binary_op.cuh" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Elementwise scalar subtraction operation on the input buffer + * + * @tparam InT input data-type. 
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + */ +template +void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { + auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; + unaryOp(out, in, len, op, stream); +} + +/** + * @brief Elementwise subtraction operation on the input buffers + * @tparam InT input data-type. Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { + auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; + binaryOp(out, in1, in2, len, op, stream); +} + +template +__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + //TODO: kernel do not use shared memory in current implementation + int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { + outDev[i] = inDev[i] - *singleScalarDev; + } +} + +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + * @remark block size has not been tuned + */ +template +void subtractDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { + // Just for the note - there is no way to express such operation with cuBLAS in effective way + // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda + const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); + subtract_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh new file mode 100644 index 0000000000..7fb22bb2da --- /dev/null +++ b/cpp/include/raft/linalg/svd.cuh @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "eig.cuh" +#include "gemm.cuh" +#include "transpose.h" + +namespace raft { +namespace linalg { + +/** + * @brief singular value decomposition (SVD) on the column major float type + * input matrix using QR method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular values of input matrix + * @param right_sing_vecs: right singular values of input matrix + * @param trans_right: transpose right vectors or not + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param stream cuda stream + */ +// TODO: activate gen_left_vec and gen_right_vec options +// TODO: couldn't template this function due to cusolverDnSgesvd and +// cusolverSnSgesvd. Check if there is any other way. +template +void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, + T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, + bool trans_right, bool gen_left_vec, bool gen_right_vec, + cudaStream_t stream) { + std::shared_ptr allocator = + handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + +#if CUDART_VERSION >= 10010 + // 46340: sqrt of max int value + ASSERT(n_rows <= 46340, + "svd solver is not supported for the data that has more than 46340 " + "samples (rows) " + "if you are using CUDA version 10.1. Please use other solvers such as " + "eig if it is available."); +#endif + + const int m = n_rows; + const int n = n_cols; + + raft::mr::device::buffer devInfo(allocator, stream, 1); + T *d_rwork = nullptr; + + int lwork = 0; + CUSOLVER_CHECK( + cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + raft::mr::device::buffer d_work(allocator, stream, lwork); + + char jobu = 'S'; + char jobvt = 'A'; + + if (!gen_left_vec) { + char new_u = 'N'; + strcpy(&jobu, &new_u); + } + + if (!gen_right_vec) { + char new_vt = 'N'; + strcpy(&jobvt, &new_vt); + } + + CUSOLVER_CHECK(cusolverDngesvd( + cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, + right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + + // Transpose the right singular vector back + if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); + + CUDA_CHECK(cudaGetLastError()); + + int dev_info; + raft::update_host(&dev_info, devInfo.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT(dev_info == 0, + "svd.cuh: svd couldn't converge to a solution. 
" + "This usually occurs when some of the features do not vary enough."); +} + +template +void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, + T *U, T *V, bool gen_left_vec, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + + int len = n_cols * n_cols; + raft::mr::device::buffer in_cross_mult(allocator, stream, len); + + T alpha = T(1); + T beta = T(0); + raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), + n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + stream); + + eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); + + raft::matrix::colReverse(V, n_cols, n_cols, stream); + raft::matrix::rowReverse(S, n_cols, 1, stream); + + raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); + + if (gen_left_vec) { + raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, + true, stream); + } +} + +/** + * @brief on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular vectors of input matrix + * @param right_sing_vecs: right singular vectors of input matrix + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. 
+ * @param stream cuda stream + */ +template +void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + math_t *sing_vals, math_t *left_sing_vecs, + math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, + math_t tol, int max_sweeps, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + + gesvdjInfo_t gesvdj_params = NULL; + + CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol)); + CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps)); + + int m = n_rows; + int n = n_cols; + + raft::mr::device::buffer devInfo(allocator, stream, 1); + + int lwork = 0; + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + + raft::mr::device::buffer d_work(allocator, stream, lwork); + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), + gesvdj_params, stream)); + + CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param U: left singular vectors of size n_rows x k + * @param S: square matrix with singular values on its diagonal, k x k + * @param V: right singular vectors of size n_cols x k + * @param out: reconstructed matrix to be returned + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values + * @param stream cuda stream + */ +template +void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, + math_t *V, math_t *out, int n_rows, int n_cols, int k, + cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + + const math_t alpha = 1.0, beta = 0.0; + raft::mr::device::buffer SVT(allocator, stream, k * n_cols); + + raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, + CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param A_d: input matrix + * @param U: left singular vectors of size n_rows x k + * @param S_vec: singular values as a vector + * @param V: right singular vectors of size n_cols x k + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values to be computed, 1.0 for normal SVD + * @param tol: tolerance for the evaluation + * @param stream cuda stream + */ +template +bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, + math_t *S_vec, math_t *V, int n_rows, int n_cols, + int k, math_t tol, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); + cublasHandle_t cublasH = handle.get_cublas_handle(); + + int m = n_rows, n = n_cols; + + // form product matrix + raft::mr::device::buffer P_d(allocator, stream, m * n); + raft::mr::device::buffer S_mat(allocator, stream, k * k); + CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, 
stream)); + CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + + raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream); + svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream); + + // get norms of each + math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream); + math_t normU = raft::matrix::getL2Norm(handle, U, m * k, stream); + math_t normS = raft::matrix::getL2Norm(handle, S_mat.data(), k * k, stream); + math_t normV = raft::matrix::getL2Norm(handle, V, n * k, stream); + math_t normP = raft::matrix::getL2Norm(handle, P_d.data(), m * n, stream); + + // calculate percent error + const math_t alpha = 1.0, beta = -1.0; + raft::mr::device::buffer A_minus_P(allocator, stream, m * n); + CUDA_CHECK( + cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, + &alpha, A_d, m, &beta, P_d.data(), m, + A_minus_P.data(), m, stream)); + + math_t norm_A_minus_P = + raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; + return (percent_error / 100.0 < tol); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h new file mode 100644 index 0000000000..d90f6271fa --- /dev/null +++ b/cpp/include/raft/linalg/transpose.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param out: output. 
Transposed input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(const raft::handle_t &handle, math_t *in, math_t *out, + int n_rows, int n_cols, cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + + int out_n_rows = n_cols; + int out_n_cols = n_rows; + + const math_t alpha = 1.0; + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam( + cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, + n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); +} + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param inout: input and output matrix + * @param n: number of rows and columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(math_t *inout, int n, cudaStream_t stream) { + auto m = n; + auto size = n * n; + auto d_inout = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh new file mode 100644 index 0000000000..46b4d296cb --- /dev/null +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, + Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise unary operation in the input array + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in the input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val);` + */ +template +void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; //silently skip in case of 0 length input + constexpr auto maxSize = + sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && + outAddr % 16 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && + outAddr % 8 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && + outAddr % 4 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && + outAddr % 2 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl( + out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, + stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { + op(out + idx, idx); + } +} + +/** + * @brief Perform an element-wise unary operation into the output array + * + * Compared to `unaryOp()`, this method does not do any reads from any inputs + * + * @tparam OutType output data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * + * @param[out] out the output array [on device] [len = len] + * @param[in] len number of elements in the input array + * @param[in] op the device-lambda which must be of the form: + * `void func(OutType* outLocationOffset, IdxType idx);` + * where outLocationOffset will be out + idx. + * @param[in] stream cuda stream where to launch work + */ +template +void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel + <<>>(out, len, op); + CUDA_CHECK(cudaGetLastError()); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh new file mode 100644 index 0000000000..0a72117140 --- /dev/null +++ b/cpp/include/raft/matrix/math.cuh @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace matrix {
+
+/**
+ * @defgroup MatrixMathOp math operations on the input matrix
+ * @{
+ */
+
+/**
+ * @brief Power of every element in the input matrix (out = scalar * in^2)
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: every squared element is multiplied by scalar
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *in, math_t *out, math_t scalar, int len,
+           cudaStream_t stream) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::binaryOp(
+    d_dest, d_src, d_src, len,
+    [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream);
+}
+
+/**
+ * @brief Power of every element in the input matrix (inout = scalar * inout^2)
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: every squared element is multiplied by scalar
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
+  power(inout, inout, scalar, len, stream);
+}
+
+/**
+ * @brief Square of every element in the input matrix (inout = inout^2)
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *inout, int len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  power(inout, scalar, len, stream);
+}
+
+/**
+ * @brief Square of every element in the input matrix (out = in^2)
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  power(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix
+ * (out = sqrt(scalar * in))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: every element is multiplied by scalar before the square root
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param set_neg_zero whether to set negative inputs to zero
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
+             cudaStream_t stream, bool set_neg_zero = false) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::unaryOp(
+    d_dest, d_src, len,
+    [=] __device__(math_t a) {
+      if (set_neg_zero) {
+        if (a < math_t(0)) {
+          return math_t(0);
+        } else {
+          return sqrt(a * scalar);
+        }
+      } else {
+        return sqrt(a * scalar);
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix
+ * (inout = sqrt(scalar * inout))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: every element is multiplied by scalar before the square root
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param set_neg_zero whether to set negative inputs to zero
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+             bool set_neg_zero = false) {
+  seqRoot(inout, inout, scalar, len, stream, set_neg_zero);
+}
+
+/**
+ * @brief Square root of every element in the input matrix (out = sqrt(in))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  seqRoot(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Square root of every element in the input matrix (inout = sqrt(inout))
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  seqRoot(inout, inout, scalar, len, stream);
+}
+
+/**
+ * @brief sets the small values to zero based on a defined threshold
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param in: input matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param thres: threshold below which |value| is set to zero
+ */
+template <typename math_t, typename IdxType>
+void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
+                        cudaStream_t stream, math_t thres = 1e-15) {
+  raft::linalg::unaryOp(
+    out, in, len,
+    [=] __device__(math_t a) {
+      if (a <= thres && -a <= thres) {
+        return math_t(0);
+      } else {
+        return a;
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief sets the small values to zero based on a defined threshold
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param thres: threshold below which |value| is set to zero
+ */
+template <typename math_t, typename IdxType>
+void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
+                        math_t thres = 1e-15) {
+  setSmallValuesZero(inout, inout, len, stream, thres);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (out = scalar / in)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param scalar: numerator used for every element
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param setzero when true, the output is set to zero wherever |input| is not
+ * greater than the threshold
+ * @param thres the threshold used to forcibly set outputs to zero
+ */
+template <typename math_t>
+void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
+                cudaStream_t stream, bool setzero = false,
+                math_t thres = 1e-15) {
+  auto d_src = in;
+  auto d_dest = out;
+
+  raft::linalg::unaryOp(
+    d_dest, d_src, len,
+    [=] __device__(math_t a) {
+      if (setzero) {
+        if (abs(a) <= thres) {
+          return math_t(0);
+        } else {
+          return scalar / a;
+        }
+      } else {
+        return scalar / a;
+      }
+    },
+    stream);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix
+ * (inout = scalar / inout)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param scalar: numerator used for every element
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ * @param setzero: (default false) when true, the output is set to zero
+ * wherever |input| is not greater than thres
+ * @param thres: threshold used to forcibly set outputs to zero
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+                bool setzero = false, math_t thres = 1e-15) {
+  reciprocal(inout, inout, scalar, len, stream, setzero, thres);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (inout = 1 / inout)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param inout: input matrix in which the result is also stored
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  reciprocal(inout, scalar, len, stream);
+}
+
+/**
+ * @brief Reciprocal of every element in the input matrix (out = 1 / in)
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param in: input matrix
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+  math_t scalar = 1.0;
+  reciprocal(in, out, scalar, len, stream);
+}
+
+/**
+ * @brief Sets every output element to the given scalar (the input is used
+ * only to carry the element type and length)
+ * @param out: output matrix. The result is stored in the out matrix
+ * @param in: input matrix
+ * @param scalar: the value every output element is set to
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void setValue(math_t *out, const math_t *in, math_t scalar, int len,
+              cudaStream_t stream = 0) {
+  raft::linalg::unaryOp(
+    out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
+}
+
+/**
+ * @brief Computes the ratio of every element to the sum of the input vector
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param handle: raft handle
+ * @param src: input matrix
+ * @param dest: output matrix. 
The result is stored in the dest matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType>
+void ratio(const raft::handle_t &handle, math_t *src, math_t *dest,
+           IdxType len, cudaStream_t stream) {
+  auto d_src = src;
+  auto d_dest = dest;
+
+  std::shared_ptr<raft::mr::device::allocator> allocator =
+    handle.get_device_allocator();
+
+  raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
+  auto *d_sum_ptr = d_sum.data();
+  auto no_op = [] __device__(math_t in) { return in; };
+  raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
+  raft::linalg::unaryOp(
+    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); },
+    stream);
+}
+
+/** @} */
+
+// Computes the argmax(d_in) column-wise in a DxN column-major matrix
+template <typename T, int TPB>
+__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(-1, -raft::myInf<T>());
+
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
+  }
+
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  if (threadIdx.x == 0) {
+    argmax[blockIdx.x] = maxKV.key;
+  }
+}
+
+/**
+ * @brief Argmax: find the row idx with maximum value for each column
+ * @param in: input matrix (column-major)
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param out: output vector of size n_cols
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
+            cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  if (D <= 32) {
+    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
+  } else if (D <= 64) {
+    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
+  } else if (D <= 128) {
+    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
+  } else {
+    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+// Utility kernel needed for signFlip.
+// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
+// flipping the sign if the |max| value for each column is negative.
+template <typename T, int TPB>
+__global__ void signFlipKernel(T *d_in, int D, int N) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax (with abs()) index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(0, 0);
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
+  }
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  // flip column sign if d_in[maxIndex] < 0
+  __shared__ bool need_sign_flip;
+  if (threadIdx.x == 0) {
+    need_sign_flip = d_in[maxKV.key] < T(0);
+  }
+  __syncthreads();
+
+  if (need_sign_flip) {
+    for (int i = threadIdx.x; i < D; i += TPB) {
+      int idx = rowStart + i;
+      d_in[idx] = -d_in[idx];
+    }
+  }
+}
+
+/**
+ * @brief sign flip for PCA. This is used to stabilize the sign of column
+ * major eigenvectors. Flips the sign if the column has negative |max|.
+ * @param inout: input matrix. 
Result also stored in this parameter + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream cuda stream + */ +template +void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { + int D = n_rows; + int N = n_cols; + auto data = inout; + if (D <= 32) { + signFlipKernel<<>>(data, D, N); + } else if (D <= 64) { + signFlipKernel<<>>(data, D, N); + } else if (D <= 128) { + signFlipKernel<<>>(data, D, N); + } else { + signFlipKernel<<>>(data, D, N); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, stream); +} + +template +void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (b == Type(0)) + return a; + else + return a * b; + }, + stream); +} + +template +void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, stream); +} + +template +void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream, + bool return_zero = false) { + if (return_zero) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return Type(0); + else + return a / b; + }, + stream); + } else { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return a; + else + return a / b; + }, + stream); + } +} + +template +void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); +} + +template +void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); +} + +}; // end namespace matrix +}; // end namespace raft diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh new file mode 100644 index 0000000000..ec7ea984db --- /dev/null +++ b/cpp/include/raft/matrix/matrix.cuh @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +using namespace std; + +/** + * @brief Copy selected rows of the input matrix into contiguous space. + * + * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], + * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. + * + * @param in input matrix + * @param n_rows number of rows of output matrix + * @param n_cols number of columns of output matrix + * @param out output matrix + * @param indices of the rows to be copied + * @param n_rows_indices number of rows to copy + * @param stream cuda stream + * @param rowMajor whether the matrix has row major layout + */ +template +void copyRows(const m_t *in, int n_rows, int n_cols, m_t *out, + const int *indices, int n_rows_indices, cudaStream_t stream, + bool rowMajor = false) { + if (rowMajor) { + ASSERT(false, "matrix.h: row major is not supported yet!"); + } + + auto size = n_rows_indices * n_cols; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int row = idx % n_rows_indices; + int col = idx / n_rows_indices; + + out[col * n_rows_indices + row] = + in[col * n_rows + indices[row]]; + }); +} + +/** + * @brief copy matrix operation for column major matrices. + * @param in: input matrix + * @param out: output matrix + * @param n_rows: number of rows of output matrix + * @param n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void copy(const m_t *in, m_t *out, int n_rows, int n_cols, + cudaStream_t stream) { + raft::copy_async(out, in, n_rows * n_cols, stream); +} + +/** + * @brief copy matrix operation for column major matrices. First n_rows and + * n_cols of input matrix "in" is copied to "out" matrix. + * @param in: input matrix + * @param in_n_rows: number of rows of input matrix + * @param out: output matrix + * @param out_n_rows: number of rows of output matrix + * @param out_n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void truncZeroOrigin(m_t *in, int in_n_rows, m_t *out, int out_n_rows, + int out_n_cols, cudaStream_t stream) { + auto m = out_n_rows; + auto k = in_n_rows; + auto size = out_n_rows * out_n_cols; + auto d_q = in; + auto d_q_trunc = out; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int row = idx % m; + int col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); +} + +/** + * @brief Columns of a column major matrix is reversed (i.e. 
first column and
+ * last column are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename m_t>
+void colReverse(m_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  auto n = n_cols;
+  auto m = n_rows;
+  auto size = n_rows * n_cols;
+  auto d_q = inout;
+  auto d_q_reversed = inout;
+  auto counting = thrust::make_counting_iterator(0);
+
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (size / 2), [=] __device__(int idx) {
+                     int dest_row = idx % m;
+                     int dest_col = idx / m;
+                     int src_row = dest_row;
+                     int src_col = (n - dest_col) - 1;
+                     m_t temp = (m_t)d_q_reversed[idx];
+                     d_q_reversed[idx] = d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
+}
+
+/**
+ * @brief Rows of a column major matrix are reversed in place (i.e. the first
+ * row and the last row are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename m_t>
+void rowReverse(m_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  auto m = n_rows;
+  auto d_q = inout;
+  auto d_q_reversed = inout;
+  auto counting = thrust::make_counting_iterator(0);
+
+  // enumerate only the top half of the rows in every column so that each
+  // (row, m - 1 - row) pair is swapped exactly once; for an odd number of
+  // rows the middle row stays in place
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (n_cols * (m / 2)), [=] __device__(int idx) {
+                     int dest_row = idx % (m / 2);
+                     int dest_col = idx / (m / 2);
+                     int src_row = (m - dest_row) - 1;
+                     int src_col = dest_col;
+
+                     m_t temp = (m_t)d_q_reversed[dest_col * m + dest_row];
+                     d_q_reversed[dest_col * m + dest_row] =
+                       d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
+}
+
+/**
+ * @brief Prints the data stored in GPU memory
+ * @param in: input matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param h_separator: horizontal separator character
+ * @param v_separator: vertical separator character
+ */
+template <typename m_t>
+void print(const m_t *in, int n_rows, int n_cols, char h_separator = ' ',
+           char v_separator = '\n') {
+  std::vector<m_t> h_matrix(n_cols * n_rows);
+  CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t),
+                        cudaMemcpyDeviceToHost));
+
+  for (auto i = 0; i < n_rows; i++) {
+    for (auto j = 0; j < n_cols; j++) {
+      printf("%1.4f%c", h_matrix[j * n_rows + i],
+             j < n_cols - 1 ? 
h_separator : v_separator); + } + } +} + +/** + * @brief Prints the data stored in CPU memory + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + */ +template +void printHost(const m_t *in, int n_rows, int n_cols) { + for (auto i = 0; i < n_rows; i++) { + for (auto j = 0; j < n_cols; j++) { + printf("%1.4f ", in[j * n_rows + i]); + } + printf("\n"); + } +} + +/** + * @brief Kernel for copying a slice of a big matrix to a small matrix with a + * size matches that slice + * @param src_d: input matrix + * @param m: number of rows of input matrix + * @param n: number of columns of input matrix + * @param dst_d: output matrix + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + */ +template +__global__ void slice(m_t *src_d, int m, int n, m_t *dst_d, int x1, int y1, + int x2, int y2) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int dm = x2 - x1, dn = y2 - y1; + if (idx < dm * dn) { + int i = idx % dm, j = idx / dm; + int is = i + x1, js = j + y1; + dst_d[idx] = src_d[is + js * m]; + } +} + +/** + * @brief Slice a matrix (in-place) + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param out: output matrix + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, + * 3, 0, 1, 4, 3); + * @param stream: cuda stream + */ +template +void sliceMatrix(m_t *in, int n_rows, int n_cols, m_t *out, int x1, int y1, + int x2, int y2, cudaStream_t stream) { + // Slicing + dim3 block(64); + dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); + slice<<>>(in, n_rows, n_cols, out, x1, y1, x2, y2); +} + +/** + * @brief Kernel for copying the upper triangular part of a matrix to another + * @param src: input matrix with a size of mxn + * @param dst: output matrix with a size of kxk + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param k: min(n_rows, n_cols) + */ +template +__global__ void getUpperTriangular(m_t *src, m_t *dst, int n_rows, int n_cols, + int k) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int m = n_rows, n = n_cols; + if (idx < m * n) { + int i = idx % m, j = idx / m; + if (i < k && j < k && j >= i) { + dst[i + j * k] = src[idx]; + } + } +} + +/** + * @brief Copy the upper triangular part of a matrix to another + * @param src: input matrix with a size of n_rows x n_cols + * @param dst: output matrix with a size of kxk, k = min(n_rows, n_cols) + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void copyUpperTriangular(m_t *src, m_t *dst, int n_rows, int n_cols, + cudaStream_t stream) { + int m = n_rows, n = n_cols; + int k = min(m, n); + dim3 block(64); + dim3 grid((m * n + block.x - 1) / block.x); + getUpperTriangular<<>>(src, dst, m, n, k); +} + +/** + * @brief Copy a vector to the diagonal of a matrix + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param m: number of rows of the matrix + * @param n: number of columns of the matrix + * @param k: dimensionality + */ +template 
+__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, int m, int n, + int k) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx < k) { + matrix[idx + idx * m] = vec[idx]; + } +} + +/** + * @brief Initialize a diagonal matrix with a vector + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param n_rows: number of rows of the matrix + * @param n_cols: number of columns of the matrix + * @param stream: cuda stream + */ +template +void initializeDiagonalMatrix(m_t *vec, m_t *matrix, int n_rows, int n_cols, + cudaStream_t stream) { + int k = min(n_rows, n_cols); + dim3 block(64); + dim3 grid((k + block.x - 1) / block.x); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, + n_cols, k); +} + +/** + * @brief Calculate the inverse of the diagonal of a square matrix + * element-wise and in place + * @param in: square input matrix with size len x len + * @param len: size of one side of the matrix + */ +template +__global__ void matrixDiagonalInverse(m_t *in, int len) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < len) { + in[idx + idx * len] = 1.0 / in[idx + idx * len]; + } +} + +/** + * @brief Get a square matrix with elements on diagonal reversed (in-place) + * @param in: square input matrix with size len x len + * @param len: size of one side of the matrix + * @param stream: cuda stream + */ +template +void getDiagonalInverseMatrix(m_t *in, int len, cudaStream_t stream) { + dim3 block(64); + dim3 grid((len + block.x - 1) / block.x); + matrixDiagonalInverse<<>>(in, len); +} + +/** + * @brief Get the L2/F-norm of a matrix/vector + * @param in: input matrix/vector with totally size elements + * @param size: size of the matrix/vector + * @param cublasH cublas handle + * @param stream: cuda stream + */ +template +m_t getL2Norm(const raft::handle_t &handle, m_t *in, int size, + cudaStream_t stream) { + cublasHandle_t cublasH = handle.get_cublas_handle(); + m_t normval = 0; + CUBLAS_CHECK( + raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + return normval; +} + +}; // end namespace matrix +}; // end namespace raft diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh new file mode 100644 index 0000000000..56710ea81f --- /dev/null +++ b/cpp/include/raft/random/rng.cuh @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "rng_impl.cuh" + +namespace raft { +namespace random { + +/** all different generator types used */ +enum GeneratorType { + /** curand-based philox generator */ + GenPhilox = 0, + /** LFSR taps generator */ + GenTaps, + /** kiss99 generator (currently the fastest) */ + GenKiss99 +}; + +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda randOp) { + LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; + detail::Generator gen(seed, (uint64_t)tid, offset); + const LenType stride = gridDim.x * blockDim.x; + for (LenType idx = tid; idx < len; idx += stride) { + MathType val; + gen.next(val); + ptr[idx] = randOp(val, idx); + } +} + +// used for Box-Muller type transformations +template +__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda2 rand2Op) { + LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; + detail::Generator gen(seed, (uint64_t)tid, offset); + const LenType stride = gridDim.x * blockDim.x; + for (LenType idx = tid; idx < len; idx += stride) { + MathType val1, val2; + gen.next(val1); + gen.next(val2); + rand2Op(val1, val2, idx, idx + stride); + if (idx < len) ptr[idx] = (OutType)val1; + idx += stride; + if (idx < len) ptr[idx] = (OutType)val2; + } +} + +template +__global__ void constFillKernel(Type *ptr, int len, Type val) { + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned stride = gridDim.x * blockDim.x; + for (unsigned idx = tid; idx < len; idx += stride) { + ptr[idx] = val; + } +} + +/** + * @brief Helper method to compute Box Muller transform + * + * @tparam Type data type + * + * @param[inout] val1 first value + * @param[inout] val2 second value + * @param[in] sigma1 standard deviation of output gaussian for first value + * @param[in] mu1 mean of output gaussian for first value + * @param[in] sigma2 standard deviation of output gaussian for second value + * @param[in] mu2 mean of output gaussian for second value + * @{ + */ +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, + Type sigma2, Type mu2) { + constexpr Type twoPi = Type(2.0) * Type(3.141592654); + constexpr Type minus2 = -Type(2.0); + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; + Type s, c; + raft::mySinCos(theta, s, c); + val1 = R * c * sigma1 + mu1; + val2 = R * s * sigma2 + mu2; +} +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { + box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); +} +/** @} */ + +/** The main random number generator class, fully on GPUs */ +class Rng { + public: + /** + * @brief ctor + * @param _s 64b seed used to initialize the RNG + * @param _t backend device RNG generator type + * @note Refer to the `Rng::seed` method for details about seeding the engine + */ + Rng(uint64_t _s, GeneratorType _t = GenPhilox) + : type(_t), + offset(0), + // simple heuristic to make sure all SMs will be occupied properly + // and also not too many initialization calls will be made by each thread + nBlocks(4 * getMultiProcessorCount()), + gen() { + seed(_s); + } + + /** + * @brief Seed (and thus re-initialize) the underlying RNG engine + * @param _s 64b seed used to initialize the RNG + * @note If you need non-reproducibility, pass a seed that's, for example, a + * function of timestamp. 
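+ * (For instance, constructing the engine as
+ * `raft::random::Rng rng{(uint64_t)time(nullptr)};` is one illustrative,
+ * non-reproducible option; the exact scheme is the caller's choice.)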
Another example is to use the c++11's + * `std::random_device` for setting seed. + */ + void seed(uint64_t _s) { + gen.seed(_s); + offset = 0; + } + + /** + * @brief Generates the 'a' and 'b' parameters for a modulo affine + * transformation equation: `(ax + b) % n` + * + * @tparam IdxT integer type + * + * @param[in] n the modulo range + * @param[out] a slope parameter + * @param[out] b intercept parameter + */ + template + void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + // always keep 'a' to be coprime to 'n' + a = gen() % n; + while (gcd(a, n) != 1) { + ++a; + if (a >= n) a = 0; + } + // the bias term 'b' can be any number in the range of [0, n) + b = gen() % n; + } + + /** + * @brief Generate uniformly distributed numbers in the given range + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param start start of the range + * @param end end of the range + * @param stream stream where to launch the kernel + * @{ + */ + template + void uniform(Type *ptr, LenType len, Type start, Type end, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'uniform' can only be floating point!"); + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return (val * (end - start)) + start; + }, + stream); + } + template + void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'uniformInt' can only be integer!"); + custom_distribution( + ptr, len, + [=] __device__(IntType val, LenType idx) { + return (val % (end - start)) + start; + }, + stream); + } + /** @} */ + + /** + * @brief Generate normal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + * @{ + */ + template + void normal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'normal' can only be floating point!"); + rand2Impl( + offset, ptr, len, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + }, + NumThreads, nBlocks, type, stream); + } + template + void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'normalInt' can only be integer!"); + rand2Impl( + offset, ptr, len, + [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + }, + NumThreads, nBlocks, type, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed table according to the given set of + * means and scalar standard deviations. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions itself is assumed to + * be absent. 
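+ *
+ * A minimal usage sketch (the names `table`, `mu_vec`, `stream` and the
+ * 100 x 8 shape are illustrative assumptions, not part of this API):
+ * @code
+ *   raft::random::Rng rng(12345ULL);
+ *   // 100 x 8 table, one mean per column in mu_vec, shared sigma of 1.0
+ *   rng.normalTable(table, 100, 8, mu_vec, nullptr, 1.0f, stream);
+ * @endcode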
+ * + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output table (dim = n_rows x n_cols) + * @param n_rows number of rows in the table + * @param n_cols number of columns in the table + * @param mu mean vector (dim = n_cols x 1). + * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass + * a nullptr to use the same scalar 'sigma' across all components + * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr + * @param stream stream where to launch the kernel + */ + template + void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, + const Type *sigma_vec, Type sigma, cudaStream_t stream) { + rand2Impl( + offset, ptr, n_rows * n_cols, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + // yikes! use fast-int-div + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; + auto mean1 = mu[col1]; + auto mean2 = mu[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); + }, + NumThreads, nBlocks, type, stream); + } + + /** + * @brief Fill an array with the given value + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param val value to be filled + * @param stream stream where to launch the kernel + */ + template + void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + constFillKernel<<>>(ptr, len, val); + CUDA_CHECK(cudaPeekAtLastError()); + } + + /** + * @brief Generate bernoulli distributed boolean array + * + * @tparam Type data type in which to compute the probabilities + * @tparam OutType output data type + * @tparam LenType data type used to represent length of the arrays + * + * @param[out] ptr the output array + * @param[in] len the number of elements in the output + * @param[in] prob coin-toss probability for heads + * @param[in] stream stream where to launch the kernel + */ + template + void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + custom_distribution( + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, + stream); + } + + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param scale scaling factor + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) { + static_assert(std::is_floating_point::value, + "Type for 'scaled_bernoulli' can only be floating point!"); + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return val > prob ? 
-scale : scale; + }, + stream); + } + + /** + * @brief Generate gumbel distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param beta scale value + * @param stream stream where to launch the kernel + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ + template + void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + return mu - beta * raft::myLog(-raft::myLog(val)); + }, + stream); + } + + /** + * @brief Generate lognormal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + */ + template + void lognormal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + rand2Impl( + offset, ptr, len, + [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { + box_muller_transform(val1, val2, sigma, mu); + val1 = raft::myExp(val1); + val2 = raft::myExp(val2); + }, + NumThreads, nBlocks, type, stream); + } + + /** + * @brief Generate logistic distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param scale scale value + * @param stream stream where to launch the kernel + */ + template + void logistic(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + return mu - scale * raft::myLog(one / val - one); + }, + stream); + } + + /** + * @brief Generate exponentially distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param lambda the lambda + * @param stream stream where to launch the kernel + */ + template + void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + return -raft::myLog(one - val) / lambda; + }, + stream); + } + + /** + * @brief Generate rayleigh distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param sigma the sigma + * @param stream stream where to launch the kernel + */ + template + void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; + return raft::mySqrt(-two * raft::myLog(one - val)) * sigma; + }, + stream); + } + + /** + * @brief Generate laplace distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type 
used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu the mean + * @param scale the scale + * @param stream stream where to launch the kernel + */ + template + void laplace(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + custom_distribution( + ptr, len, + [=] __device__(Type val, LenType idx) { + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; + constexpr Type oneHalf = (Type)0.5; + Type out; + if (val <= oneHalf) { + out = mu + scale * raft::myLog(two * val); + } else { + out = mu - scale * raft::myLog(two * (one - val)); + } + return out; + }, + stream); + } + + /** + * @brief Sample the input array without replacement, optionally based on the + * input weight vector for each element in the array + * + * Implementation here is based on the `one-pass sampling` algo described here: + * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * + * @note In the sampled array the elements which are picked will always appear + * in the increasing order of their weights as computed using the exponential + * distribution. So, if you're particular about the order (for eg. array + * permutations), then this might not be the right choice! + * + * @tparam DataT data type + * @tparam WeightsT weights type + * @tparam IdxT index type + * @param out output sampled array (of length 'sampledLen') + * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass + * a nullptr if this is not required. + * @param in input array to be sampled (of length 'len') + * @param wts weights array (of length 'len'). Pass a nullptr if uniform + * sampling is desired + * @param sampledLen output sampled array length + * @param len input array length + * @param allocator device allocator for allocating any workspace required + * @param stream cuda stream + */ + template + void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, + IdxT *outIdx, const DataT *in, + const WeightsT *wts, IdxT sampledLen, IdxT len, + cudaStream_t stream) { + ASSERT(sampledLen <= len, + "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + + std::shared_ptr allocator = + handle.get_device_allocator(); + + raft::mr::device::buffer expWts(allocator, stream, len); + raft::mr::device::buffer sortedWts(allocator, stream, len); + raft::mr::device::buffer inIdx(allocator, stream, len); + raft::mr::device::buffer outIdxBuff(allocator, stream, len); + auto *inIdxPtr = inIdx.data(); + // generate modified weights + custom_distribution( + expWts.data(), len, + [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { + inIdxPtr[idx] = idx; + constexpr WeightsT one = (WeightsT)1.0; + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { + return exp / wts[idx]; + } + return exp; + }, + stream); + ///@todo: use a more efficient partitioning scheme instead of full sort + // sort the array and pick the top sampledLen items + IdxT *outIdxPtr = outIdxBuff.data(); + raft::mr::device::buffer workspace(allocator, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, + (int)len, stream); + if (outIdx != nullptr) { + CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, + cudaMemcpyDeviceToDevice, stream)); + } + scatter(out, in, outIdxPtr, sampledLen, stream); + } + + /** + * @brief Core method to generate a pdf based on the cdf that is defined in + * the input device lambda + * + * @tparam OutType 
output type + * @tparam MathType type on which arithmetic is done + * @tparam LenTyp index type + * @tparam Lambda device lambda (or operator) + * + * @param[out] ptr output buffer [on device] [len = len] + * @param[in] len number of elements to be generated + * @param[in] randOp the device lambda or operator + * @param[in] stream cuda stream + * @{ + */ + template + void custom_distribution(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + randImpl( + offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); + } + template + void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + rand2Impl( + offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); + } + /** @} */ + + private: + /** generator type */ + GeneratorType type; + /** + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ + uint64_t offset; + /** number of blocks to launch */ + int nBlocks; + /** next seed generator for device-side RNG */ + std::mt19937_64 gen; + + static const int NumThreads = 256; + + template + uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, + int nThreads, int nBlocks) { + LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); + if (IsNormal && itemsPerThread % 2 == 1) { + ++itemsPerThread; + } + // curand uses 2 32b uint's to generate one double + uint64_t factor = sizeof(Type) / sizeof(float); + if (factor == 0) ++factor; + // Check if there are enough random numbers left in sequence + // If not, then generate new seed and start from zero offset + uint64_t newOffset = offset + LenType(itemsPerThread) * factor; + if (newOffset < offset) { + offset = 0; + seed = gen(); + newOffset = itemsPerThread * factor; + } + return newOffset; + } + + template + void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { + if (len <= 0) return; + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); + switch (type) { + case GenPhilox: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + case GenTaps: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + case GenKiss99: + randKernel + <<>>(seed, offset, ptr, len, randOp); + break; + default: + ASSERT(false, "randImpl: Incorrect generator type! %d", type); + }; + CUDA_CHECK(cudaGetLastError()); + offset = newOffset; + } + + template + void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { + if (len <= 0) return; + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); + switch (type) { + case GenPhilox: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + case GenTaps: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + case GenKiss99: + rand2Kernel + <<>>(seed, offset, ptr, len, rand2Op); + break; + default: + ASSERT(false, "rand2Impl: Incorrect generator type! 
%d", type); + }; + CUDA_CHECK(cudaGetLastError()); + offset = newOffset; + } +}; + +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh new file mode 100644 index 0000000000..d44c6f018b --- /dev/null +++ b/cpp/include/raft/random/rng_impl.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace random { +namespace detail { + +/** Philox-based random number generator */ +// Courtesy: Jakub Szuppe +struct PhiloxGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed random seed (can be same across all threads) + * @param subsequence as found in curand docs + * @param offset as found in curand docs + */ + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + curand_init(seed, subsequence, offset, &state); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + DI void next(float& ret) { ret = curand_uniform(&(this->state)); } + DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t& ret) { ret = curand(&(this->state)); } + DI void next(uint64_t& ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + curandStatePhilox4_32_10_t state; +}; + +/** LFSR taps-filter for generating random numbers. */ +// Courtesy: Vinay Deshpande +struct TapsGenerator { + /** + * @brief ctor. 
Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + uint64_t stride = blockDim.x * gridDim.x; + delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; + stride *= blockDim.y * gridDim.y; + delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; + state = seed + delta + 1; + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type& ret) { + constexpr double ULL_LARGE = 1.8446744073709551614e19; + uint64_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(ULL_LARGE); + } + DI void next(uint64_t& ret) { + constexpr uint64_t TAPS = 0x8000100040002000ULL; + constexpr int ROUNDS = 128; + for (int i = 0; i < ROUNDS; i++) + state = (state >> 1) ^ (-(state & 1ULL) & TAPS); + ret = state; + } + DI void next(uint32_t& ret) { + uint64_t val; + next(val); + ret = (uint32_t)val; + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + uint64_t state; +}; + +/** Kiss99-based random number generator */ + +struct Kiss99Generator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + initKiss99(seed); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type& ret) { + constexpr double U_LARGE = 4.294967295e9; + uint32_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(U_LARGE); + } + DI void next(uint32_t& ret) { + uint32_t MWC; + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); + MWC = ((z << 16) + w); + jsr ^= (jsr << 17); + jsr ^= (jsr >> 13); + jsr ^= (jsr << 5); + jcong = 69069 * jcong + 1234567; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; + } + DI void next(uint64_t& ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t& ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t& ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** one of the kiss99 states */ + uint32_t z; + /** one of the kiss99 states */ + uint32_t w; + /** one of the kiss99 states */ + uint32_t jsr; + /** one of the kiss99 states */ + uint32_t jcong; + + // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower + // 128 bits. It uses 32-bit wide multiply only. 
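+  // The hash is kept as four little-endian 32-bit words h[0]..h[3]; every
+  // 64-bit partial product below lands in one or two of those words, and the
+  // carries between words are propagated manually.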
+ DI void mulByFnv1a128Prime(uint32_t* h) { + typedef union { + uint32_t u32[2]; + uint64_t u64[1]; + } words64; + + // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 + // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 + const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); + // Partial products + words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; + + h0p0.u64[0] = uint64_t(h[0]) * p0; + h1p0.u64[0] = uint64_t(h[1]) * p0; + h2p0.u64[0] = uint64_t(h[2]) * p0; + h0p2.u64[0] = uint64_t(h[0]) * p2; + h3p0.u64[0] = uint64_t(h[3]) * p0; + h1p2.u64[0] = uint64_t(h[1]) * p2; + + // h_n[0] = LO(h[0]*p[0]); + // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); + // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); + // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); + uint32_t carry = 0; + h[0] = h0p0.u32[0]; + + h[1] = h0p0.u32[1] + h1p0.u32[0]; + carry = h[1] < h0p0.u32[1] ? 1 : 0; + + h[2] = h1p0.u32[1] + carry; + carry = h[2] < h1p0.u32[1] ? 1 : 0; + h[2] += h2p0.u32[0]; + carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; + h[2] += h0p2.u32[0]; + carry = h[2] < h0p2.u32[0] ? carry + 1 : carry; + + h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; + return; + } + + DI void fnv1a128(uint32_t* hash, uint32_t txt) { + hash[0] ^= (txt >> 0) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 8) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 16) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 24) & 0xFF; + mulByFnv1a128Prime(hash); + } + + DI void initKiss99(uint64_t seed) { + // Initialize hash to 128-bit FNV1a basis + uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; + + // Digest threadIdx, blockIdx and seed + fnv1a128(hash, threadIdx.x); + fnv1a128(hash, threadIdx.y); + fnv1a128(hash, threadIdx.z); + fnv1a128(hash, blockIdx.x); + fnv1a128(hash, blockIdx.y); + fnv1a128(hash, blockIdx.z); + fnv1a128(hash, uint32_t(seed)); + fnv1a128(hash, uint32_t(seed >> 32)); + + // Initialize KISS99 state with hash + z = hash[0]; + w = hash[1]; + jsr = hash[2]; + jcong = hash[3]; + } +}; + +/** + * @brief generator-agnostic way of generating random numbers + * @tparam GenType the generator object that expose 'next' method + */ +template +struct Generator { + DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) + : gen(seed, subsequence, offset) {} + + template + DI void next(Type& ret) { + gen.next(ret); + } + + private: + /** the actual generator */ + GenType gen; +}; + +}; // end namespace detail +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh new file mode 100644 index 0000000000..8691cabc85 --- /dev/null +++ b/cpp/include/raft/stats/mean.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: ColsPerBlk has been tested only for 32! 
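+// In the row-major kernel below, each block owns a ColsPerBlk-wide strip of
+// columns: threads first accumulate privately over rows, then combine per
+// column in shared memory via atomics, and finally one atomic add per column
+// folds the block's partial sum into the global output. Illustrative use of
+// the public mean() API defined further down (mu_dev, data_dev, D, N and
+// stream are assumed, caller-provided names):
+//   raft::stats::mean(mu_dev, data_dev, D, N, false, true, stream);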
+template +__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { + const int RowsPerBlkPerIter = TPB / ColsPerBlk; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; + for (IdxType i = rowId; i < N; i += stride) + thread_data += (colId < D) ? data[i * D + colId] : Type(0); + __shared__ Type smu[ColsPerBlk]; + if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); + __syncthreads(); + raft::myAtomicAdd(smu + thisColId, thread_data); + __syncthreads(); + if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]); +} + +template +__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Type thread_data = Type(0); + IdxType colStart = N * blockIdx.x; + for (IdxType i = threadIdx.x; i < N; i += TPB) { + IdxType idx = colStart + i; + thread_data += data[idx]; + } + Type acc = BlockReduce(temp_storage).Sum(thread_data); + if (threadIdx.x == 0) { + mu[blockIdx.x] = acc / N; + } +} + +/** + * @brief Compute mean of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type: the data type + * @tparam IdxType Integer type used to for addressing + * @param mu: the output mean vector + * @param data: the input matrix + * @param D: number of columns of data + * @param N: number of rows of data + * @param sample: whether to evaluate sample mean or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor: whether the input data is row or col major + * @param stream: cuda stream + */ +template +void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, + bool rowMajor, cudaStream_t stream) { + static const int TPB = 256; + if (rowMajor) { + static const int RowsPerThread = 4; + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); + CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); + meanKernelRowMajor + <<>>(mu, data, D, N); + CUDA_CHECK(cudaPeekAtLastError()); + Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); + raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); + } else { + meanKernelColMajor + <<>>(mu, data, D, N); + } + CUDA_CHECK(cudaPeekAtLastError()); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh new file mode 100644 index 0000000000..04934d4388 --- /dev/null +++ b/cpp/include/raft/stats/mean_center.cuh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace stats { + +/** + * @brief Center the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-centered matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { + raft::linalg::matrixVectorOp( + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); +} + +/** + * @brief Add the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-added matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { + raft::linalg::matrixVectorOp( + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh new file mode 100644 index 0000000000..f12c633829 --- /dev/null +++ b/cpp/include/raft/stats/stddev.cuh @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace stats { + +///@todo: ColPerBlk has been tested only for 32! +template +__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, + IdxType N) { + const int RowsPerBlkPerIter = TPB / ColsPerBlk; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; + for (IdxType i = rowId; i < N; i += stride) { + Type val = (colId < D) ? 
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
new file mode 100644
index 0000000000..f12c633829
--- /dev/null
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/eltwise.cuh>
+
+namespace raft {
+namespace stats {
+
+///@todo: ColsPerBlk has been tested only for 32!
+template <typename Type, typename IdxType, int TPB, int ColsPerBlk>
+__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
+                                     IdxType N) {
+  const int RowsPerBlkPerIter = TPB / ColsPerBlk;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  for (IdxType i = rowId; i < N; i += stride) {
+    Type val = (colId < D) ? data[i * D + colId] : Type(0);
+    thread_data += val * val;
+  }
+  __shared__ Type sstd[ColsPerBlk];
+  if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0);
+  __syncthreads();
+  raft::myAtomicAdd(sstd + thisColId, thread_data);
+  __syncthreads();
+  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(std + colId, sstd[thisColId]);
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void stddevKernelColMajor(Type *std, const Type *data,
+                                     const Type *mu, IdxType D, IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  Type m = mu[blockIdx.x];
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    Type diff = data[idx] - m;
+    thread_data += diff * diff;
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    std[blockIdx.x] = raft::mySqrt(acc / N);
+  }
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
+                                   IdxType D, IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  Type m = mu[blockIdx.x];
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    Type diff = data[idx] - m;
+    thread_data += diff * diff;
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    var[blockIdx.x] = acc / N;
+  }
+}
+
+/**
+ * @brief Compute stddev of the input matrix
+ *
+ * Stddev operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param std the output stddev vector
+ * @param data the input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate the sample stddev, i.e. whether to
+ * normalize the output using N-1 (true) or N (false)
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
+            bool sample, bool rowMajor, cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(std, 0, sizeof(Type) * D, stream));
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(std, data, D, N);
+    Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
+    raft::linalg::binaryOp(
+      std, std, mu, D,
+      [ratio] __device__(Type a, Type b) {
+        return raft::mySqrt(a * ratio - b * b);
+      },
+      stream);
+  } else {
+    stddevKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(std, data, mu, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
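For reference, the row-major path relies on the identity Var(x) = E[x^2] - (E[x])^2: stddevKernelRowMajor accumulates per-column sums of squares, and the binaryOp epilogue computes sqrt(sum_sq * ratio - mu * mu). With ratio = 1/N this is exactly the population stddev; with ratio = 1/(N-1) the raw second moment is rescaled instead, which is how this implementation applies the sample correction.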
+
+/**
+ * @brief Compute variance of the input matrix
+ *
+ * Variance operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param var the output variance vector
+ * @param data the input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate the sample variance, i.e. whether to
+ * normalize the output using N-1 (true) or N (false)
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
+          bool sample, bool rowMajor, cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(var, 0, sizeof(Type) * D, stream));
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(var, data, D, N);
+    Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
+    raft::linalg::binaryOp(
+      var, var, mu, D,
+      [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
+  } else {
+    varsKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(var, data, mu, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // namespace stats
+};  // namespace raft
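A hypothetical end-to-end sketch (illustrative buffer names; the mean is computed first because both routines take mu as an input):

@code{.cu}
raft::stats::mean(d_mu, d_data, D, N, false /* population */, true, stream);
raft::stats::stddev(d_std, d_data, d_mu, D, N, false, true /* rowMajor */,
                    stream);
raft::stats::vars(d_var, d_data, d_mu, D, N, false, true, stream);
@endcode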
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
new file mode 100644
index 0000000000..5f8416c7e2
--- /dev/null
+++ b/cpp/include/raft/stats/sum.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/eltwise.cuh>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+namespace stats {
+
+///@todo: ColsPerBlk has been tested only for 32!
+template <typename Type, typename IdxType, int TPB, int ColsPerBlk>
+__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
+  const int RowsPerBlkPerIter = TPB / ColsPerBlk;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  for (IdxType i = rowId; i < N; i += stride)
+    thread_data += (colId < D) ? data[i * D + colId] : Type(0);
+  __shared__ Type smu[ColsPerBlk];
+  if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
+  __syncthreads();
+  raft::myAtomicAdd(smu + thisColId, thread_data);
+  __syncthreads();
+  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]);
+}
+
+template <typename Type, typename IdxType, int TPB>
+__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
+  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  Type thread_data = Type(0);
+  IdxType colStart = N * blockIdx.x;
+  for (IdxType i = threadIdx.x; i < N; i += TPB) {
+    IdxType idx = colStart + i;
+    thread_data += data[idx];
+  }
+  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  if (threadIdx.x == 0) {
+    mu[blockIdx.x] = acc;
+  }
+}
+
+/**
+ * @brief Compute sum of the input matrix
+ *
+ * Sum operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param output the output sum vector
+ * @param input the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream on which to launch work
+ */
+template <typename Type, typename IdxType = int>
+void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
+         cudaStream_t stream) {
+  static const int TPB = 256;
+  if (rowMajor) {
+    static const int RowsPerThread = 4;
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    CUDA_CHECK(cudaMemsetAsync(output, 0, sizeof(Type) * D, stream));
+    sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(output, input, D, N);
+  } else {
+    sumKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(output, input, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end namespace stats
+};  // end namespace raft
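And the corresponding sketch for the plain column-wise sum (again with illustrative names):

@code{.cu}
raft::stats::sum(d_colsums, d_data, D, N, true /* rowMajor */, stream);
// Afterwards d_colsums[j] equals the sum over rows i of d_data[i * D + j].
@endcode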
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
new file mode 100644
index 0000000000..1829fc0351
--- /dev/null
+++ b/cpp/include/raft/vectorized.cuh
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_fp16.h>
+#include "cuda_utils.cuh"
+
+namespace raft {
+
+template <typename math_, int VecLen>
+struct IOType {};
+template <>
+struct IOType<bool, 1> {
+  static_assert(sizeof(bool) == sizeof(int8_t),
+                "IOType bool size assumption failed");
+  typedef int8_t Type;
+};
+template <>
+struct IOType<bool, 2> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<bool, 4> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<bool, 8> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<bool, 16> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<int8_t, 1> {
+  typedef int8_t Type;
+};
+template <>
+struct IOType<int8_t, 2> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<int8_t, 4> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int8_t, 8> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<int8_t, 16> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<uint8_t, 1> {
+  typedef uint8_t Type;
+};
+template <>
+struct IOType<uint8_t, 2> {
+  typedef uint16_t Type;
+};
+template <>
+struct IOType<uint8_t, 4> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint8_t, 8> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint8_t, 16> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<int16_t, 1> {
+  typedef int16_t Type;
+};
+template <>
+struct IOType<int16_t, 2> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int16_t, 4> {
+  typedef int2 Type;
+};
+template <>
+struct IOType<int16_t, 8> {
+  typedef int4 Type;
+};
+template <>
+struct IOType<uint16_t, 1> {
+  typedef uint16_t Type;
+};
+template <>
+struct IOType<uint16_t, 2> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint16_t, 4> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint16_t, 8> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<__half, 1> {
+  typedef __half Type;
+};
+template <>
+struct IOType<__half, 2> {
+  typedef __half2 Type;
+};
+template <>
+struct IOType<__half, 4> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<__half, 8> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<__half2, 1> {
+  typedef __half2 Type;
+};
+template <>
+struct IOType<__half2, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<__half2, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<int32_t, 1> {
+  typedef int32_t Type;
+};
+template <>
+struct IOType<int32_t, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<int32_t, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<uint32_t, 1> {
+  typedef uint32_t Type;
+};
+template <>
+struct IOType<uint32_t, 2> {
+  typedef uint2 Type;
+};
+template <>
+struct IOType<uint32_t, 4> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<float, 1> {
+  typedef float Type;
+};
+template <>
+struct IOType<float, 2> {
+  typedef float2 Type;
+};
+template <>
+struct IOType<float, 4> {
+  typedef float4 Type;
+};
+template <>
+struct IOType<int64_t, 1> {
+  typedef int64_t Type;
+};
+template <>
+struct IOType<int64_t, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<uint64_t, 1> {
+  typedef uint64_t Type;
+};
+template <>
+struct IOType<uint64_t, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<unsigned long long, 1> {
+  typedef unsigned long long Type;
+};
+template <>
+struct IOType<unsigned long long, 2> {
+  typedef uint4 Type;
+};
+template <>
+struct IOType<double, 1> {
+  typedef double Type;
+};
+template <>
+struct IOType<double, 2> {
+  typedef double2 Type;
+};
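A quick illustration of what the specializations above encode; the assertions are hypothetical, assuming <type_traits> is included and the header is compiled as CUDA code:

@code{.cu}
// Four floats move as a single 128-bit float4; two doubles as one double2.
static_assert(std::is_same<raft::IOType<float, 4>::Type, float4>::value,
              "float x4 should vectorize to float4");
static_assert(std::is_same<raft::IOType<double, 2>::Type, double2>::value,
              "double x2 should vectorize to double2");
@endcode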
+
+/**
+ * @struct TxN_t
+ *
+ * @brief Internal data structure that is used to define a facade for
+ * vectorized loads/stores across the most common POD types. The goal of this
+ * file is to provide CUDA programmers with an easy way to have the compiler
+ * issue vectorized load or store instructions to memory (either global or
+ * shared). Vectorized accesses are important as they utilize the memory
+ * resources more efficiently than their non-vectorized counterparts. If, for
+ * whatever reason, one is unable to issue such vectorized operations, one can
+ * always fall back to using plain POD types.
+ *
+ * Example demonstrating the use of load operations, performing math on such
+ * loaded data and finally storing it back.
+ * @code{.cu}
+ * TxN_t<uint8_t, 4> mydata1, mydata2;
+ * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
+ * mydata1.load(ptr1, idx);
+ * mydata2.load(ptr2, idx);
+ * #pragma unroll
+ * for (int i = 0; i < mydata1.Ratio; ++i) {
+ *   mydata1.val.data[i] += mydata2.val.data[i];
+ * }
+ * mydata1.store(ptr1, idx);
+ * @endcode
+ *
+ * The code above remains almost identical if one moves to a different
+ * TxN_t<type, veclen> instantiation. The only change required is to update
+ * the variable declarations appropriately.
+ *
+ * Obviously, it is the caller's responsibility to take care of pointer
+ * alignment!
+ *
+ * @tparam math_ the data-type in which the compute/math needs to happen
+ * @tparam veclen_ the number of 'math_' types to be loaded/stored per
+ * instruction
+ */
+template <typename math_, int veclen_>
+struct TxN_t {
+  /** underlying math data type */
+  typedef math_ math_t;
+  /** internal storage data type */
+  typedef typename IOType<math_t, veclen_>::Type io_t;
+
+  /** defines the number of 'math_t' types stored by this struct */
+  static const int Ratio = veclen_;
+
+  union {
+    /** the vectorized data that is used for subsequent operations */
+    math_t data[Ratio];
+    /** internal data used to ensure vectorized loads/stores */
+    io_t internal;
+  } val;
+
+  ///@todo: add default constructor
+
+  /**
+   * @brief Fill the contents of this structure with a constant value
+   * @param _val the constant to be filled
+   */
+  DI void fill(math_t _val) {
+#pragma unroll
+    for (int i = 0; i < Ratio; ++i) {
+      val.data[i] = _val;
+    }
+  }
+
+  ///@todo: how to handle out-of-bounds!!?
+
+  /**
+   * @defgroup LoadsStores Global/Shared vectored loads or stores
+   *
+   * @brief Perform vectored loads/stores on this structure
+   * @tparam idx_t index data type
+   * @param ptr base pointer from where to load (or store) the data. It must
+   * be aligned to 'sizeof(io_t)'!
+   * @param idx the offset from the base pointer which will be loaded
+   * (or stored) by the current thread. This must be aligned to 'Ratio'!
+   *
+   * @note: In case of loads, after a successful execution, the val.data will
+   * be populated with the desired data loaded from the pointer location. In
+   * case of stores, the data in the val.data will be stored to that location.
+   * @{
+   */
+  template <typename idx_t>
+  DI void load(const math_t *ptr, idx_t idx) {
+    const io_t *bptr = reinterpret_cast<const io_t *>(&ptr[idx]);
+    val.internal = __ldg(bptr);
+  }
+
+  template <typename idx_t>
+  DI void load(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+    val.internal = *bptr;
+  }
+
+  template <typename idx_t>
+  DI void store(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+    *bptr = val.internal;
+  }
+  /** @} */
+};
+
+/** this is just to keep the compiler happy! */
+template <typename math_>
+struct TxN_t<math_, 0> {
+  typedef math_ math_t;
+  static const int Ratio = 1;
+
+  union {
+    math_t data[1];
+  } val;
+
+  DI void fill(math_t _val) {}
+  template <typename idx_t>
+  DI void load(const math_t *ptr, idx_t idx) {}
+  template <typename idx_t>
+  DI void load(math_t *ptr, idx_t idx) {}
+  template <typename idx_t>
+  DI void store(math_t *ptr, idx_t idx) {}
+};
+
+}  // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
new file mode 100644
index 0000000000..2fc9d4e30f
--- /dev/null
+++ b/cpp/test/linalg/add.cu
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/add.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+#include "add.cuh"
+
+namespace raft {
+namespace linalg {
+
+template <typename InT, typename OutT = InT>
+class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<AddInputs<InT, OutT>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.len;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(in1, len);
+    raft::allocate(in2, len);
+    raft::allocate(out_ref, len);
+    raft::allocate(out, len);
+    r.uniform(in1, len, InT(-1.0), InT(1.0), stream);
+    r.uniform(in2, len, InT(-1.0), InT(1.0), stream);
+    naiveAddElem(out_ref, in1, in2, len);
+    add(out, in1, in2, len, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(in1));
+    CUDA_CHECK(cudaFree(in2));
+    CUDA_CHECK(cudaFree(out_ref));
+    CUDA_CHECK(cudaFree(out));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void compare() {
+    ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
+                                  raft::CompareApprox<OutT>(params.tolerance)));
+  }
+
+ protected:
+  AddInputs<InT, OutT> params;
+  InT *in1, *in2;
+  OutT *out_ref, *out;
+  cudaStream_t stream;
+};
+
+const std::vector<AddInputs<float>> inputsf = {
+  {0.000001f, 1024 * 1024, 1234ULL},
+  {0.000001f, 1024 * 1024 + 2, 1234ULL},
+  {0.000001f, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<float> AddTestF;
+TEST_P(AddTestF, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<AddInputs<double>> inputsd = {
+  {0.00000001, 1024 * 1024, 1234ULL},
+  {0.00000001, 1024 * 1024 + 2, 1234ULL},
+  {0.00000001, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<double> AddTestD;
+TEST_P(AddTestD, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestD, ::testing::ValuesIn(inputsd));
+
+const std::vector<AddInputs<float, double>> inputsfd = {
+  {0.00000001, 1024 * 1024, 1234ULL},
+  {0.00000001, 1024 * 1024 + 2, 1234ULL},
+  {0.00000001, 1024 * 1024 + 1, 1234ULL},
+};
+typedef AddTest<float, double> AddTestFD;
+TEST_P(AddTestFD, Result) { compare(); }
+INSTANTIATE_TEST_SUITE_P(AddTests, AddTestFD, ::testing::ValuesIn(inputsfd));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh
new file mode 100644
index 0000000000..137419758f
--- /dev/null
+++ b/cpp/test/linalg/add.cuh
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = OutT(in1[idx] + in2[idx]); + } +} + +template +void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveAddElemKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct AddInputs { + OutT tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const AddInputs &dims) { + return os; +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu new file mode 100644 index 0000000000..357ade7388 --- /dev/null +++ b/cpp/test/linalg/binary_op.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "binary_op.cuh" + +namespace raft { +namespace linalg { + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { + binaryOp( + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); +} + +template +class BinaryOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam< + BinaryOpInputs>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + IdxType len = params.len; + allocate(in1, len); + allocate(in2, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in1, len, InType(-1.0), InType(1.0), stream); + r.uniform(in2, len, InType(-1.0), InType(1.0), stream); + naiveAdd(out_ref, in1, in2, len); + binaryOpLaunch(out, in1, in2, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + BinaryOpInputs params; + InType *in1, *in2; + OutType *out_ref, *out; +}; + +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32; +TEST_P(BinaryOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i64; +TEST_P(BinaryOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, 
params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32_D; +TEST_P(BinaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestD_i32; +TEST_P(BinaryOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.00000001, 1024 * 1024, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestD_i64; +TEST_P(BinaryOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // namespace linalg +} // namespace raft diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh new file mode 100644 index 0000000000..fd8ed6dd1e --- /dev/null +++ b/cpp/test/linalg/binary_op.cuh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveAddKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); + if (idx < len) { + out[idx] = static_cast(in1[idx] + in2[idx]); + } +} + +template +void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { + static const IdxType TPB = 64; + IdxType nblks = raft::ceildiv(len, TPB); + naiveAddKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct BinaryOpInputs { + InType tolerance; + IdxType len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const BinaryOpInputs &d) { + return os; +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu new file mode 100644 index 0000000000..e45f5651b4 --- /dev/null +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct coalescedReductionInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const coalescedReductionInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream, bool inplace = false) { + coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, + [] __device__(T in, int i) { return in * in; }); +} + +template +class coalescedReductionTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(data, len); + raft::allocate(dots_exp, rows); + raft::allocate(dots_act, rows); + r.uniform(data, len, T(-1.0), T(1.0), stream); + naiveCoalescedReduction(dots_exp, data, cols, rows, stream); + + // Perform reduction with default inplace = false first + coalescedReductionLaunch(dots_act, data, cols, rows, stream); + // Add to result with inplace = true next + coalescedReductionLaunch(dots_act, data, cols, rows, stream, true); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + } + + protected: + coalescedReductionInputs params; + T *data, *dots_exp, *dots_act; +}; + +const std::vector> inputsf = { + {0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; + +typedef coalescedReductionTest coalescedReductionTestF; +TEST_P(coalescedReductionTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +typedef coalescedReductionTest coalescedReductionTestD; +TEST_P(coalescedReductionTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu new file mode 100644 index 0000000000..2396558939 --- /dev/null +++ b/cpp/test/linalg/divide.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +template +__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in[idx] / scalar; + } +} + +template +void naiveDivide(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveDivideKernel<<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +class DivideTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + raft::allocate(in, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveDivide(out_ref, in, params.scalar, len, stream); + divideScalar(out, in, params.scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + UnaryOpInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef DivideTest DivideTestF; +TEST_P(DivideTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, + ::testing::ValuesIn(inputsf)); + +typedef DivideTest DivideTestD; +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu new file mode 100644 index 0000000000..159d288174 --- /dev/null +++ b/cpp/test/linalg/eig.cu @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct EigInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; + int n; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { + return os; +} + +template +class EigTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + + raft::allocate(cov_matrix, len); + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + ASSERT(len == 16, "This test only works with 4x4 matrices!"); + raft::update_device(cov_matrix, cov_matrix_h, len, stream); + + raft::allocate(eig_vectors, len); + raft::allocate(eig_vals, params.n_col); + raft::allocate(eig_vectors_jacobi, len); + raft::allocate(eig_vals_jacobi, params.n_col); + + T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, + 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, + 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + + raft::allocate(eig_vectors_ref, len); + raft::allocate(eig_vals_ref, params.n_col); + + raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); + raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); + + eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, + stream); + + T tol = 1.e-7; + int sweeps = 15; + eigJacobi(handle, cov_matrix, params.n_row, params.n_col, + eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps); + + // test code for comparing two methods + len = params.n * params.n; + raft::allocate(cov_matrix_large, len); + raft::allocate(eig_vectors_large, len); + raft::allocate(eig_vectors_jacobi_large, len); + raft::allocate(eig_vals_large, params.n); + raft::allocate(eig_vals_jacobi_large, params.n); + + r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); + + eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, + eig_vals_large, stream); + eigJacobi(handle, cov_matrix_large, params.n, params.n, + eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol, + sweeps); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(cov_matrix)); + CUDA_CHECK(cudaFree(eig_vectors)); + CUDA_CHECK(cudaFree(eig_vectors_jacobi)); + CUDA_CHECK(cudaFree(eig_vals)); + CUDA_CHECK(cudaFree(eig_vals_jacobi)); + CUDA_CHECK(cudaFree(eig_vectors_ref)); + CUDA_CHECK(cudaFree(eig_vals_ref)); + } + + protected: + EigInputs params; + T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, + *eig_vals_jacobi, *eig_vals_ref; + + T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, + *eig_vals_large, *eig_vals_jacobi_large; + + cudaStream_t stream; +}; + +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; + +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; + +typedef EigTest EigTestValF; +TEST_P(EigTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValD; +TEST_P(EigTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecF; 
+TEST_P(EigTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecD; +TEST_P(EigTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValJacobiF; +TEST_P(EigTestValJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestValJacobiD; +TEST_P(EigTestValJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecJacobiF; +TEST_P(EigTestVecJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecJacobiD; +TEST_P(EigTestVecJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecCompareF; +TEST_P(EigTestVecCompareF, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigTest EigTestVecCompareD; +TEST_P(EigTestVecCompareD, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, + ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, + ::testing::ValuesIn(inputsd2)); + +} // namespace linalg +} // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu new file mode 100644 index 0000000000..b3980f281d --- /dev/null +++ b/cpp/test/linalg/eig_sel.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if CUDART_VERSION >= 10010 + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct EigSelInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; + int n; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { + return os; +} + +template +class EigSelTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + stream = handle.get_stream(); + + params = ::testing::TestWithParam>::GetParam(); + int len = params.len; + + raft::allocate(cov_matrix, len); + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + ASSERT(len == 16, "This test only works with 4x4 matrices!"); + raft::update_device(cov_matrix, cov_matrix_h, len, stream); + + raft::allocate(eig_vectors, 12); + raft::allocate(eig_vals, params.n_col); + + T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, + -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + + raft::allocate(eig_vectors_ref, 12); + raft::allocate(eig_vals_ref, params.n_col); + + raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); + raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); + + eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, + eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(cov_matrix)); + CUDA_CHECK(cudaFree(eig_vectors)); + CUDA_CHECK(cudaFree(eig_vals)); + CUDA_CHECK(cudaFree(eig_vectors_ref)); + CUDA_CHECK(cudaFree(eig_vals_ref)); + } + + protected: + EigSelInputs params; + T *cov_matrix, *eig_vectors, *eig_vectors_ref, *eig_vals, *eig_vals_ref; + + cudaStream_t stream; +}; + +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; + +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; + +typedef EigSelTest EigSelTestValF; +TEST_P(EigSelTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestValD; +TEST_P(EigSelTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestVecF; +TEST_P(EigSelTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef EigSelTest EigSelTestVecD; +TEST_P(EigSelTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, + ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft + +#endif diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu new file mode 100644 index 0000000000..572951c557 --- /dev/null +++ b/cpp/test/linalg/eltwise.cu @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +//// Testing unary ops + +template +__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = scalar * in[idx]; + } +} + +template +void naiveScale(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct ScalarMultiplyInputs { + T tolerance; + int len; + T scalar; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const ScalarMultiplyInputs &dims) { + return os; +} + +template +class ScalarMultiplyTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + T scalar = params.scalar; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveScale(out_ref, in, scalar, len, stream); + scalarMultiply(out, in, scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + ScalarMultiplyInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf1 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; + +const std::vector> inputsd1 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; + +typedef ScalarMultiplyTest ScalarMultiplyTestF; +TEST_P(ScalarMultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +typedef ScalarMultiplyTest ScalarMultiplyTestD; +TEST_P(ScalarMultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, + ::testing::ValuesIn(inputsf1)); + +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, + ::testing::ValuesIn(inputsd1)); + +//// Testing binary ops + +template +__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, + int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] + in2[idx]; + } +} + +template +void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveAddKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct EltwiseAddInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream 
&operator<<(::std::ostream &os, + const EltwiseAddInputs &dims) { + return os; +} + +template +class EltwiseAddTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + int len = params.len; + allocate(in1, len); + allocate(in2, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in1, len, T(-1.0), T(1.0), stream); + r.uniform(in2, len, T(-1.0), T(1.0), stream); + naiveAdd(out_ref, in1, in2, len, stream); + eltwiseAdd(out, in1, in2, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + EltwiseAddInputs params; + T *in1, *in2, *out_ref, *out; +}; + +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; + +typedef EltwiseAddTest EltwiseAddTestF; +TEST_P(EltwiseAddTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +typedef EltwiseAddTest EltwiseAddTestD; +TEST_P(EltwiseAddTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu new file mode 100644 index 0000000000..cecfc5eb8e --- /dev/null +++ b/cpp/test/linalg/gemm_layout.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct GemmLayoutInputs { + int M; + int N; + int K; + bool zLayout; + bool xLayout; + bool yLayout; + unsigned long long int seed; +}; + +// Reference GEMM implementation. +template +__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, + bool isZColMajor, bool isXColMajor, + bool isYColMajor) { + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int tidy = blockIdx.y * blockDim.y + threadIdx.y; + + for (int m = tidy; m < M; m += (blockDim.y * gridDim.y)) { + for (int n = tidx; n < N; n += (blockDim.x * gridDim.x)) { + T temp = T(0.0); + for (int k = 0; k < K; k++) { + int xIndex = isXColMajor ? m + k * M : m * K + k; + int yIndex = isYColMajor ? k + n * K : k * N + n; + temp += X[xIndex] * Y[yIndex]; + } + int zIndex = isZColMajor ? 
m + n * M : m * N + n; + Z[zIndex] = temp; + } + } +} + +template +class GemmLayoutTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + raft::random::Rng r(params.seed); + + // We compute Z = X * Y and compare against reference result + // Dimensions of X : M x K + // Dimensions of Y : K x N + // Dimensions of Z : M x N + + T *X = NULL; // Argument X + T *Y = NULL; // Argument Y + + size_t xElems = params.M * params.K; + size_t yElems = params.K * params.N; + size_t zElems = params.M * params.N; + + CUDA_CHECK(cudaMalloc(&X, xElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&Y, yElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&refZ, zElems * sizeof(T))); + CUDA_CHECK(cudaMalloc(&Z, zElems * sizeof(T))); + + r.uniform(X, xElems, T(-10.0), T(10.0), stream); + r.uniform(Y, yElems, T(-10.0), T(10.0), stream); + + dim3 blocks(raft::ceildiv(params.M, 128), + raft::ceildiv(params.N, 4), 1); + dim3 threads(128, 4, 1); + + naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, + params.zLayout, params.xLayout, + params.yLayout); + + gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, + params.xLayout, params.yLayout, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(refZ)); + CUDA_CHECK(cudaFree(Z)); + } + + protected: + GemmLayoutInputs params; + T *refZ = NULL; // Reference result for comparison + T *Z = NULL; // Computed result +}; + +const std::vector> inputsf = { + {80, 70, 80, true, true, true, 76433ULL}, + {80, 100, 40, true, true, false, 426646ULL}, + {20, 100, 20, true, false, true, 237703ULL}, + {100, 60, 30, true, false, false, 538004ULL}, + {50, 10, 60, false, true, true, 73012ULL}, + {90, 90, 30, false, true, false, 538147ULL}, + {30, 100, 10, false, false, true, 412352ULL}, + {40, 80, 100, false, false, false, 297941ULL}}; + +const std::vector> inputsd = { + {10, 70, 40, true, true, true, 535648ULL}, + {30, 30, 30, true, true, false, 956681ULL}, + {70, 80, 50, true, false, true, 875083ULL}, + {80, 90, 70, true, false, false, 50744ULL}, + {90, 90, 30, false, true, true, 506321ULL}, + {40, 100, 70, false, true, false, 638418ULL}, + {80, 50, 30, false, false, true, 701529ULL}, + {50, 80, 60, false, false, false, 893038ULL}}; + +typedef GemmLayoutTest GemmLayoutTestF; +TEST_P(GemmLayoutTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-4))); +} + +typedef GemmLayoutTest GemmLayoutTestD; +TEST_P(GemmLayoutTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-6))); +} + +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu new file mode 100644 index 0000000000..adbb339de2 --- /dev/null +++ b/cpp/test/linalg/map_then_reduce.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveMapReduceKernel(Type *out, const Type *in, size_t len, + MapOp map) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + raft::myAtomicAdd(out, map(in[idx])); + } +} + +template +void naiveMapReduce(Type *out, const Type *in, size_t len, MapOp map, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel + <<>>(out, in, len, map); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct MapReduceInputs { + T tolerance; + size_t len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void mapReduceLaunch(T *out_ref, T *out, const T *in, size_t len, + cudaStream_t stream) { + auto op = [] __device__(T in) { return in; }; + naiveMapReduce(out_ref, in, len, op, stream); + mapThenSumReduce(out, len, op, 0, in); +} + +template +class MapReduceTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + auto len = params.len; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + mapReduceLaunch(out_ref, out, in, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + MapReduceInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.001f, 1024 * 1024, 1234ULL}}; +typedef MapReduceTest MapReduceTestF; +TEST_P(MapReduceTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.000001, 1024 * 1024, 1234ULL}}; +typedef MapReduceTest MapReduceTestD; +TEST_P(MapReduceTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu new file mode 100644 index 0000000000..aa46c78b0f --- /dev/null +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "../test_utils.h" +#include "matrix_vector_op.cuh" + +namespace raft { +namespace linalg { + +template +struct MatVecOpInputs { + T tolerance; + IdxType rows, cols; + bool rowMajor, bcastAlongRows, useTwoVectors; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const MatVecOpInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, bool useTwoVectors, + cudaStream_t stream) { + if (useTwoVectors) { + matrixVectorOp( + out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, stream); + } else { + matrixVectorOp( + out, in, vec1, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, stream); + } +} + +template +class MatVecOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + IdxType N = params.rows, D = params.cols; + IdxType len = N * D; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + IdxType vecLen = params.bcastAlongRows ? 
D : N; + allocate(vec1, vecLen); + allocate(vec2, vecLen); + r.uniform(in, len, (T)-1.0, (T)1.0, stream); + r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); + r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); + if (params.useTwoVectors) { + naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); + } else { + naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); + } + matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, params.useTwoVectors, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(vec1)); + CUDA_CHECK(cudaFree(vec2)); + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(in)); + } + + protected: + MatVecOpInputs params; + T *in, *out, *out_ref, *vec1, *vec2; +}; + +const std::vector> inputsf_i32 = { + {0.00001f, 1024, 32, true, true, false, 1234ULL}, + {0.00001f, 1024, 64, true, true, false, 1234ULL}, + {0.00001f, 1024, 32, true, false, false, 1234ULL}, + {0.00001f, 1024, 64, true, false, false, 1234ULL}, + {0.00001f, 1024, 32, false, true, false, 1234ULL}, + {0.00001f, 1024, 64, false, true, false, 1234ULL}, + {0.00001f, 1024, 32, false, false, false, 1234ULL}, + {0.00001f, 1024, 64, false, false, false, 1234ULL}, + + {0.00001f, 1024, 32, true, true, true, 1234ULL}, + {0.00001f, 1024, 64, true, true, true, 1234ULL}, + {0.00001f, 1024, 32, true, false, true, 1234ULL}, + {0.00001f, 1024, 64, true, false, true, 1234ULL}, + {0.00001f, 1024, 32, false, true, true, 1234ULL}, + {0.00001f, 1024, 64, false, true, true, 1234ULL}, + {0.00001f, 1024, 32, false, false, true, 1234ULL}, + {0.00001f, 1024, 64, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestF_i32; +TEST_P(MatVecOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.00001f, 2500, 250, false, false, false, 1234ULL}, + {0.00001f, 2500, 250, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestF_i64; +TEST_P(MatVecOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.0000001, 1024, 32, true, true, false, 1234ULL}, + {0.0000001, 1024, 64, true, true, false, 1234ULL}, + {0.0000001, 1024, 32, true, false, false, 1234ULL}, + {0.0000001, 1024, 64, true, false, false, 1234ULL}, + {0.0000001, 1024, 32, false, true, false, 1234ULL}, + {0.0000001, 1024, 64, false, true, false, 1234ULL}, + {0.0000001, 1024, 32, false, false, false, 1234ULL}, + {0.0000001, 1024, 64, false, false, false, 1234ULL}, + + {0.0000001, 1024, 32, true, true, true, 1234ULL}, + {0.0000001, 1024, 64, true, true, true, 1234ULL}, + {0.0000001, 1024, 32, true, false, true, 1234ULL}, + {0.0000001, 1024, 64, true, false, true, 1234ULL}, + {0.0000001, 1024, 32, false, true, true, 1234ULL}, + {0.0000001, 1024, 64, false, true, true, 1234ULL}, + {0.0000001, 1024, 32, false, false, true, 1234ULL}, + {0.0000001, 1024, 64, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestD_i32; +TEST_P(MatVecOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} 
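+// NOTE: the four (rowMajor, bcastAlongRows) combinations exercised above all
+// reduce to a single indexing rule for the broadcast vector, mirrored by
+// naiveMatVec in matrix_vector_op.cuh. A minimal host-side sketch of that
+// rule; `broadcastIndex` is a hypothetical name used for illustration only,
+// not a raft API:
+template <typename IdxType>
+inline IdxType broadcastIndex(IdxType idx, IdxType D, IdxType N, bool rowMajor,
+                              bool bcastAlongRows) {
+  if (rowMajor && bcastAlongRows) return idx % D;    // vector has D entries
+  if (!rowMajor && !bcastAlongRows) return idx % N;  // vector has N entries
+  if (rowMajor && !bcastAlongRows) return idx / D;   // vector has N entries
+  return idx / N;                                    // vector has D entries
+}
+// This rule is also why SetUp sizes the vectors as
+// vecLen = bcastAlongRows ? D : N.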
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.0000001, 2500, 250, false, false, false, 1234ULL}, + {0.0000001, 2500, 250, false, false, true, 1234ULL}}; +typedef MatVecOpTest MatVecOpTestD_i64; +TEST_P(MatVecOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh new file mode 100644 index 0000000000..69c45c9866 --- /dev/null +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Type scalar) { + IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; + IdxType len = N * D; + IdxType col; + if (rowMajor && bcastAlongRows) { + col = idx % D; + } else if (!rowMajor && !bcastAlongRows) { + col = idx % N; + } else if (rowMajor && !bcastAlongRows) { + col = idx / D; + } else { + col = idx / N; + } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec[col]; + } +} + +template +void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { + static const IdxType TPB = 64; + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, + Type scalar) { + IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; + IdxType len = N * D; + IdxType col; + if (rowMajor && bcastAlongRows) { + col = idx % D; + } else if (!rowMajor && !bcastAlongRows) { + col = idx % N; + } else if (rowMajor && !bcastAlongRows) { + col = idx / D; + } else { + col = idx / N; + } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; + } +} + +template +void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, + IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, + Type scalar) { + static const IdxType TPB = 64; + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, + bcastAlongRows, scalar); + CUDA_CHECK(cudaPeekAtLastError()); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu new file mode 100644 index 
0000000000..1d3e753de3 --- /dev/null +++ b/cpp/test/linalg/multiply.cu @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +template +class MultiplyTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + raft::allocate(in, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0), stream); + naiveScale(out_ref, in, params.scalar, len, stream); + multiplyScalar(out, in, params.scalar, len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + UnaryOpInputs params; + T *in, *out_ref, *out; +}; + +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef MultiplyTest MultiplyTestF; +TEST_P(MultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, + ::testing::ValuesIn(inputsf)); + +typedef MultiplyTest MultiplyTestD; +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu new file mode 100644 index 0000000000..acc25addd0 --- /dev/null +++ b/cpp/test/linalg/norm.cu @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/norm.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace linalg {
+
+template <typename T>
+struct NormInputs {
+  T tolerance;
+  int rows, cols;
+  NormType type;
+  bool do_sqrt;
+  bool rowMajor;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const NormInputs<T> &I) {
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", "
+     << I.type << ", " << I.do_sqrt << ", " << I.rowMajor << ", " << I.seed
+     << '}' << std::endl;
+  return os;
+}
+
+///// Row-wise norm test definitions
+template <typename Type>
+__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
+  Type acc = (Type)0;
+  int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
+  if (rowStart < N) {
+    for (int i = 0; i < D; ++i) {
+      if (type == L2Norm) {
+        acc += data[rowStart * D + i] * data[rowStart * D + i];
+      } else {
+        acc += raft::myAbs(data[rowStart * D + i]);
+      }
+    }
+    dots[rowStart] = do_sqrt ? raft::mySqrt(acc) : acc;
+  }
+}
+
+template <typename Type>
+void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
+  static const int TPB = 64;
+  int nblks = raft::ceildiv(N, TPB);
+  naiveRowNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename T>
+class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
+ public:
+  void SetUp() override {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    // all work runs on the member stream; it is destroyed once, in TearDown
+    raft::allocate(data, len);
+    raft::allocate(dots_exp, rows);
+    raft::allocate(dots_act, rows);
+    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
+    if (params.do_sqrt) {
+      auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
+    } else {
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor,
+              stream);
+    }
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(dots_exp));
+    CUDA_CHECK(cudaFree(dots_act));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  NormInputs<T> params;
+  T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
+};
+
+///// Column-wise norm test definitions
+template <typename Type>
+__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
+  int colID = threadIdx.x + blockIdx.x * blockDim.x;
+  if (colID >= D) return;  // avoid out-of-bounds thread
+
+  Type acc = 0;
+  for (int i = 0; i < N; i++) {
+    Type v = data[colID + i * D];
+    acc += type == L2Norm ? v * v : raft::myAbs(v);
+  }
+
+  dots[colID] = do_sqrt ? raft::mySqrt(acc) : acc;
+}
+
+template <typename Type>
+void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
+  static const int TPB = 64;
+  int nblks = raft::ceildiv(D, TPB);
+  naiveColNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename T>
+class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
+ public:
+  void SetUp() override {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    // as in RowNormTest, all work runs on the member stream owned by TearDown
+    raft::allocate(data, len);
+    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    raft::allocate(dots_exp, cols);
+    raft::allocate(dots_act, cols);
+
+    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
+    if (params.do_sqrt) {
+      auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
+    } else {
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor,
+              stream);
+    }
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(dots_exp));
+    CUDA_CHECK(cudaFree(dots_act));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  NormInputs<T> params;
+  T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
+};
+
+///// Row- and column-wise tests
+const std::vector<NormInputs<float>> inputsf = {
+  {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
+
+  {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
+
+const std::vector<NormInputs<double>> inputsd = {
+  {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 64, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 128, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 256, L1Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 32, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 64, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 128, L2Norm, false, true, 1234ULL},
+  {0.00000001, 1024, 256, L2Norm, false, true, 1234ULL},
+
+  {0.00000001, 1024, 32, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 64, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 128, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 256, L1Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 32, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 64, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 128, L2Norm, true, true, 1234ULL},
+  {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}};
+
+typedef RowNormTest<float> RowNormTestF;
+TEST_P(RowNormTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
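+// NOTE: a quick worked example of what naiveRowNorm computes for one row
+// r = [0.5, -0.5, 1.0]:
+//   L1Norm             -> |0.5| + |-0.5| + |1.0| = 2.0
+//   L2Norm             -> 0.25 + 0.25 + 1.0     = 1.5
+//   L2Norm w/ do_sqrt  -> sqrt(1.5)             ~= 1.2247
+// A minimal host-side reference with the same contract; `hostRowNorm` is a
+// hypothetical helper shown only to document the expected values, and it
+// assumes <cmath> is available for std::sqrt:
+template <typename T>
+inline T hostRowNorm(const T *row, int D, NormType type, bool do_sqrt) {
+  T acc = T(0);
+  for (int i = 0; i < D; ++i) {
+    T v = row[i] < T(0) ? -row[i] : row[i];  // |row[i]|
+    acc += type == L2Norm ? row[i] * row[i] : v;
+  }
+  return do_sqrt ? std::sqrt(acc) : acc;
+}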
+typedef RowNormTest RowNormTestD; +TEST_P(RowNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, + ::testing::ValuesIn(inputsd)); + +const std::vector> inputscf = { + {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 64, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 128, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 256, 1024, L1Norm, false, true, 1234ULL}, + {0.00001f, 32, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 64, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 128, 1024, L2Norm, false, true, 1234ULL}, + {0.00001f, 256, 1024, L2Norm, false, true, 1234ULL}, + + {0.00001f, 32, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 64, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 128, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 256, 1024, L1Norm, true, true, 1234ULL}, + {0.00001f, 32, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 64, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 128, 1024, L2Norm, true, true, 1234ULL}, + {0.00001f, 256, 1024, L2Norm, true, true, 1234ULL}}; + +const std::vector> inputscd = { + {0.00000001, 32, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 64, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 128, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 256, 1024, L1Norm, false, true, 1234ULL}, + {0.00000001, 32, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 64, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 128, 1024, L2Norm, false, true, 1234ULL}, + {0.00000001, 256, 1024, L2Norm, false, true, 1234ULL}, + + {0.00000001, 32, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 64, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 128, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 256, 1024, L1Norm, true, true, 1234ULL}, + {0.00000001, 32, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 64, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 128, 1024, L2Norm, true, true, 1234ULL}, + {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; + +typedef ColNormTest ColNormTestF; +TEST_P(ColNormTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef ColNormTest ColNormTestD; +TEST_P(ColNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, + ::testing::ValuesIn(inputscf)); + +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, + ::testing::ValuesIn(inputscd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu new file mode 100644 index 0000000000..255cf1a696 --- /dev/null +++ b/cpp/test/linalg/reduce.cu @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct ReduceInputs { + T tolerance; + int rows, cols; + bool rowMajor, alongRows; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const ReduceInputs &dims) { + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor, + bool alongRows, bool inplace, cudaStream_t stream) { + reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace, + [] __device__(T in, int i) { return in * in; }); +} + +template +class ReduceTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + outlen = params.alongRows ? rows : cols; + raft::allocate(data, len); + raft::allocate(dots_exp, outlen); + raft::allocate(dots_act, outlen); + r.uniform(data, len, T(-1.0), T(1.0), stream); + naiveReduction(dots_exp, data, cols, rows, params.rowMajor, + params.alongRows, stream); + + // Perform reduction with default inplace = false first + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, + false, stream); + // Add to result with inplace = true next, which shouldn't affect + // in the case of coalescedReduction! + if (!(params.rowMajor ^ params.alongRows)) { + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, + params.alongRows, true, stream); + } + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + protected: + ReduceInputs params; + T *data, *dots_exp, *dots_act; + int outlen; + cudaStream_t stream; +}; + +const std::vector> inputsf = { + {0.000002f, 1024, 32, true, true, 1234ULL}, + {0.000002f, 1024, 64, true, true, 1234ULL}, + {0.000002f, 1024, 128, true, true, 1234ULL}, + {0.000002f, 1024, 256, true, true, 1234ULL}, + {0.000002f, 1024, 32, true, false, 1234ULL}, + {0.000002f, 1024, 64, true, false, 1234ULL}, + {0.000002f, 1024, 128, true, false, 1234ULL}, + {0.000002f, 1024, 256, true, false, 1234ULL}, + {0.000002f, 1024, 32, false, true, 1234ULL}, + {0.000002f, 1024, 64, false, true, 1234ULL}, + {0.000002f, 1024, 128, false, true, 1234ULL}, + {0.000002f, 1024, 256, false, true, 1234ULL}, + {0.000002f, 1024, 32, false, false, 1234ULL}, + {0.000002f, 1024, 64, false, false, 1234ULL}, + {0.000002f, 1024, 128, false, false, 1234ULL}, + {0.000002f, 1024, 256, false, false, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, true, true, 1234ULL}, + {0.000000001, 1024, 64, true, true, 1234ULL}, + {0.000000001, 1024, 128, true, true, 1234ULL}, + {0.000000001, 1024, 256, true, true, 1234ULL}, + {0.000000001, 1024, 32, true, false, 1234ULL}, + {0.000000001, 1024, 64, true, false, 1234ULL}, + {0.000000001, 1024, 128, true, false, 1234ULL}, + {0.000000001, 1024, 256, true, false, 1234ULL}, + {0.000000001, 1024, 32, false, true, 1234ULL}, + {0.000000001, 1024, 64, false, true, 1234ULL}, + {0.000000001, 1024, 128, 
false, true, 1234ULL}, + {0.000000001, 1024, 256, false, true, 1234ULL}, + {0.000000001, 1024, 32, false, false, 1234ULL}, + {0.000000001, 1024, 64, false, false, 1234ULL}, + {0.000000001, 1024, 128, false, false, 1234ULL}, + {0.000000001, 1024, 256, false, false, 1234ULL}}; + +typedef ReduceTest ReduceTestF; +TEST_P(ReduceTestF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); +} + +typedef ReduceTest ReduceTestD; +TEST_P(ReduceTestD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh new file mode 100644 index 0000000000..18261287cf --- /dev/null +++ b/cpp/test/linalg/reduce.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +template +__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data, + int D, int N) { + Type acc = (Type)0; + int rowStart = threadIdx.x + blockIdx.x * blockDim.x; + if (rowStart < N) { + for (int i = 0; i < D; ++i) { + acc += data[rowStart * D + i] * data[rowStart * D + i]; + } + dots[rowStart] = 2 * acc; + } +} + +template +void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel + <<>>(dots, data, D, N); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +void unaryAndGemv(Type *dots, const Type *data, int D, int N, + cudaStream_t stream) { + //computes a MLCommon unary op on data (squares it), then computes Ax + //(A input matrix and x column vector) to sum columns + thrust::device_vector sq(D * N); + raft::linalg::unaryOp( + thrust::raw_pointer_cast(sq.data()), data, D * N, + [] __device__(Type v) { return v * v; }, stream); + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); + thrust::device_vector ones(N, 1); //column vector [1...1] + Type alpha = 1, beta = 0; + CUBLAS_CHECK(raft::linalg::cublasgemv( + handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, + thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUDA_CHECK(cudaDeviceSynchronize()); + CUBLAS_CHECK(cublasDestroy(handle)); +} + +template +void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor, + bool alongRows, cudaStream_t stream) { + if (rowMajor && alongRows) { + naiveCoalescedReduction(dots, data, D, N, stream); + } else if (rowMajor && !alongRows) { + unaryAndGemv(dots, data, D, N, stream); + } else if (!rowMajor && alongRows) { + unaryAndGemv(dots, data, N, D, stream); + } else { + 
naiveCoalescedReduction(dots, data, N, D, stream); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu new file mode 100644 index 0000000000..b27fa2ac1a --- /dev/null +++ b/cpp/test/linalg/strided_reduction.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "reduce.cuh" + +namespace raft { +namespace linalg { + +template +struct stridedReductionInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream) { + stridedReduction(dots, data, cols, rows, (T)0, stream, false, + [] __device__(T in, int i) { return in * in; }); +} + +template +class stridedReductionTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int rows = params.rows, cols = params.cols; + int len = rows * cols; + + raft::allocate(data, len); + raft::allocate(dots_exp, cols); //expected dot products (from test) + raft::allocate(dots_act, cols); //actual dot products (from prim) + r.uniform(data, len, T(-1.0), T(1.0), + stream); //initialize matrix to random + + unaryAndGemv(dots_exp, data, cols, rows, stream); + stridedReductionLaunch(dots_act, data, cols, rows, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(dots_exp)); + CUDA_CHECK(cudaFree(dots_act)); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + protected: + stridedReductionInputs params; + T *data, *dots_exp, *dots_act; + cudaStream_t stream; +}; + +const std::vector> inputsf = { + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; + +typedef stridedReductionTest stridedReductionTestF; +TEST_P(stridedReductionTestF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +typedef stridedReductionTest stridedReductionTestD; +TEST_P(stridedReductionTestD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu new file mode 100644 index 
0000000000..ced3f65fdd --- /dev/null +++ b/cpp/test/linalg/subtract.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, + const Type *in2, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] - in2[idx]; + } +} + +template +void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveSubtractElemKernel<<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, + const Type in2, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in1[idx] - in2; + } +} + +template +void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel + <<>>(out, in1, in2, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct SubtractInputs { + T tolerance; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { + return os; +} + +template +class SubtractTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(in1, len); + raft::allocate(in2, len); + raft::allocate(out_ref, len); + raft::allocate(out, len); + r.uniform(in1, len, T(-1.0), T(1.0), stream); + r.uniform(in2, len, T(-1.0), T(1.0), stream); + + naiveSubtractElem(out_ref, in1, in2, len, stream); + naiveSubtractScalar(out_ref, out_ref, T(1), len, stream); + + subtract(out, in1, in2, len, stream); + subtractScalar(out, out, T(1), len, stream); + subtract(in1, in1, in2, len, stream); + subtractScalar(in1, in1, T(1), len, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in1)); + CUDA_CHECK(cudaFree(in2)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + protected: + SubtractInputs params; + T *in1, *in2, *out_ref, *out; +}; + +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; + +typedef SubtractTest SubtractTestF; +TEST_P(SubtractTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); +} + +typedef SubtractTest SubtractTestD; 
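+// NOTE: each TEST_P below checks two things -- that the out-of-place result
+// `out` matches the naive reference `out_ref`, and that the in-place calls
+// above, which overwrote `in1`, produced the same values. In-place aliasing
+// (output buffer == input buffer) is safe for these element-wise primitives
+// because every thread reads and writes only its own index, e.g.
+//   out[idx] = in1[idx] - in2[idx];
+// has no cross-thread dependence, so `in1` may alias `out` without a race.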
+TEST_P(SubtractTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, + ::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu new file mode 100644 index 0000000000..fff321768f --- /dev/null +++ b/cpp/test/linalg/svd.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct SvdInputs { + T tolerance; + int len; + int n_row; + int n_col; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { + return os; +} + +template +class SvdTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + raft::handle_t handle; + + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + int len = params.len; + cudaStream_t stream = handle.get_stream(); + raft::allocate(data, len); + + ASSERT(params.n_row == 3, "This test only supports nrows=3!"); + ASSERT(params.len == 6, "This test only supports len=6!"); + T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; + raft::update_device(data, data_h, len, stream); + + int left_evl = params.n_row * params.n_col; + int right_evl = params.n_col * params.n_col; + + raft::allocate(left_eig_vectors_qr, left_evl); + raft::allocate(right_eig_vectors_trans_qr, right_evl); + raft::allocate(sing_vals_qr, params.n_col); + + // allocate(left_eig_vectors_jacobi, left_evl); + // allocate(right_eig_vectors_trans_jacobi, right_evl); + // allocate(sing_vals_jacobi, params.n_col); + + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, + 0.488195, 0.110706, -0.865685}; + + T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; + + T sing_vals_ref_h[] = {7.065283, 1.040081}; + + raft::allocate(left_eig_vectors_ref, left_evl); + raft::allocate(right_eig_vectors_ref, right_evl); + raft::allocate(sing_vals_ref, params.n_col); + + raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, + stream); + raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, + right_evl, stream); + raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream); + + svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, + left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, + stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(left_eig_vectors_qr)); + CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); + CUDA_CHECK(cudaFree(sing_vals_qr)); 
+ CUDA_CHECK(cudaFree(left_eig_vectors_ref)); + CUDA_CHECK(cudaFree(right_eig_vectors_ref)); + CUDA_CHECK(cudaFree(sing_vals_ref)); + } + + protected: + SvdInputs params; + T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, + *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; +}; + +const std::vector> inputsf2 = { + {0.00001f, 3 * 2, 3, 2, 1234ULL}}; + +const std::vector> inputsd2 = { + {0.00001, 3 * 2, 3, 2, 1234ULL}}; + +typedef SvdTest SvdTestValF; +TEST_P(SvdTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestValD; +TEST_P(SvdTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestLeftVecF; +TEST_P(SvdTestLeftVecF, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestLeftVecD; +TEST_P(SvdTestLeftVecD, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestRightVecF; +TEST_P(SvdTestRightVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +typedef SvdTest SvdTestRightVecD; +TEST_P(SvdTestRightVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, + ::testing::ValuesIn(inputsf2)); + +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, + ::testing::ValuesIn(inputsd2)); + +// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, +// ::testing::ValuesIn(inputsf2)); + +// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD, +//::testing::ValuesIn(inputsd2)); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu new file mode 100644 index 0000000000..f10b029962 --- /dev/null +++ b/cpp/test/linalg/transpose.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <raft/linalg/transpose.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace linalg {
+
+template <typename T>
+struct TransposeInputs {
+  T tolerance;
+  int len;
+  int n_row;
+  int n_col;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os,
+                           const TransposeInputs<T> &dims) {
+  return os;
+}
+
+template <typename T>
+class TransposeTest : public ::testing::TestWithParam<TransposeInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<TransposeInputs<T>>::GetParam();
+
+    stream = handle.get_stream();
+
+    int len = params.len;
+
+    raft::allocate(data, len);
+    ASSERT(params.len == 9, "This test works only with len=9!");
+    T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
+    raft::update_device(data, data_h, len, stream);
+
+    raft::allocate(data_trans_ref, len);
+    T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0};
+    raft::update_device(data_trans_ref, data_ref_h, len, stream);
+
+    raft::allocate(data_trans, len);
+
+    transpose(handle, data, data_trans, params.n_row, params.n_col, stream);
+    transpose(data, params.n_row, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(data_trans));
+    CUDA_CHECK(cudaFree(data_trans_ref));
+  }
+
+ protected:
+  TransposeInputs<T> params;
+  T *data, *data_trans, *data_trans_ref;
+  raft::handle_t handle;
+  cudaStream_t stream;
+};
+
+const std::vector<TransposeInputs<float>> inputsf2 = {
+  {0.1f, 3 * 3, 3, 3, 1234ULL}};
+
+const std::vector<TransposeInputs<double>> inputsd2 = {
+  {0.1, 3 * 3, 3, 3, 1234ULL}};
+
+typedef TransposeTest<float> TransposeTestValF;
+TEST_P(TransposeTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
+}
+
+typedef TransposeTest<double> TransposeTestValD;
+TEST_P(TransposeTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF,
+                         ::testing::ValuesIn(inputsf2));
+
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD,
+                         ::testing::ValuesIn(inputsd2));
+
+} // end namespace linalg
+} // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
new file mode 100644
index 0000000000..666ab8619d
--- /dev/null
+++ b/cpp/test/linalg/unary_op.cu
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include "../test_utils.h" +#include "unary_op.cuh" + +namespace raft { +namespace linalg { + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, + cudaStream_t stream) { + if (in == nullptr) { + auto op = [scalar] __device__(OutType * ptr, IdxType idx) { + *ptr = static_cast(scalar * idx); + }; + writeOnlyUnaryOp(out, len, op, stream); + } else { + auto op = [scalar] __device__(InType in) { + return static_cast(in * scalar); + }; + unaryOp(out, in, len, op, stream); + } +} + +template +class UnaryOpTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam< + UnaryOpInputs>::GetParam(); + raft::random::Rng r(params.seed); + CUDA_CHECK(cudaStreamCreate(&stream)); + auto len = params.len; + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + r.uniform(in, len, InType(-1.0), InType(1.0), stream); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + } + + virtual void DoTest() { + auto len = params.len; + auto scalar = params.scalar; + naiveScale(out_ref, in, scalar, len, stream); + unaryOpLaunch(out, in, scalar, len, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); + } + + UnaryOpInputs params; + InType *in; + OutType *out_ref, *out; + cudaStream_t stream; +}; + +template +class WriteOnlyUnaryOpTest : public UnaryOpTest { + protected: + void DoTest() override { + auto len = this->params.len; + auto scalar = this->params.scalar; + naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream); + CUDA_CHECK(cudaStreamSynchronize(this->stream)); + ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len, + CompareApprox(this->params.tolerance))); + } +}; + +#define UNARY_OP_TEST(Name, inputs) \ + TEST_P(Name, Result) { DoTest(); } \ + INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) + +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32; +UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; +UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i32, inputsf_i32); + +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i64; +UNARY_OP_TEST(UnaryOpTestF_i64, inputsf_i64); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i64; +UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i64, inputsf_i64); + +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32_D; +UNARY_OP_TEST(UnaryOpTestF_i32_D, inputsf_i32_d); + +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestD_i32; +UNARY_OP_TEST(UnaryOpTestD_i32, inputsd_i32); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestD_i32; +UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i32, inputsd_i32); + +const std::vector> inputsd_i64 = { + {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestD_i64; 
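+// NOTE: the write-only variants exercised here differ from plain unaryOp in
+// the lambda signature. Instead of mapping an input value, the op receives
+// the output pointer plus the element index and never reads the input, e.g.
+// (mirroring the lambda in unaryOpLaunch above):
+//   [scalar] __device__(OutType *ptr, IdxType idx) {
+//     *ptr = static_cast<OutType>(scalar * idx);
+//   }
+// which lets writeOnlyUnaryOp skip the input load entirely -- hence the
+// in == nullptr convention shared by unaryOpLaunch and naiveScale.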
+UNARY_OP_TEST(UnaryOpTestD_i64, inputsd_i64); +typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestD_i64; +UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i64, inputsd_i64); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh new file mode 100644 index 0000000000..be3f1124c5 --- /dev/null +++ b/cpp/test/linalg/unary_op.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, + IdxType len) { + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); + if (idx < len) { + if (in == nullptr) { + // used for testing writeOnlyUnaryOp + out[idx] = static_cast(scalar * idx); + } else { + out[idx] = static_cast(scalar * in[idx]); + } + } +} + +template +void naiveScale(OutType *out, const InType *in, InType scalar, int len, + cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel + <<>>(out, in, scalar, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct UnaryOpInputs { + OutType tolerance; + IdxType len; + InType scalar; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const UnaryOpInputs &d) { + return os; +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu new file mode 100644 index 0000000000..578139623a --- /dev/null +++ b/cpp/test/matrix/math.cu @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace matrix { + +template +__global__ void nativePowerKernel(Type *in, Type *out, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = in[idx] * in[idx]; + } +} + +template +void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + nativePowerKernel<<>>(in, out, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < len) { + out[idx] = sqrt(in[idx]); + } +} + +template +void naiveSqrt(Type *in, Type *out, int len) { + static const int TPB = 64; + int nblks = raft::ceildiv(len, TPB); + nativeSqrtKernel<<>>(in, out, len); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, + int colCount) { + int d_i = blockIdx.x * rowCount; + int end = d_i + rowCount; + + if (blockIdx.x < colCount) { + Type max = 0.0; + int max_index = 0; + for (int i = d_i; i < end; i++) { + Type val = in[i]; + if (val < 0.0) { + val = -val; + } + if (val > max) { + max = val; + max_index = i; + } + } + + for (int i = d_i; i < end; i++) { + if (in[max_index] < 0.0) { + out[i] = -in[i]; + } else { + out[i] = in[i]; + } + } + } + + __syncthreads(); +} + +template +void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { + naiveSignFlipKernel<<>>(in, out, rowCount, colCount); + CUDA_CHECK(cudaPeekAtLastError()); +} + +template +struct MathInputs { + T tolerance; + int n_row; + int n_col; + int len; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { + return os; +} + +template +class MathTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + random::Rng r(params.seed); + int len = params.len; + + allocate(in_power, len); + allocate(out_power_ref, len); + allocate(in_sqrt, len); + allocate(out_sqrt_ref, len); + allocate(in_sign_flip, len); + allocate(out_sign_flip_ref, len); + + raft::handle_t handle; + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(in_ratio, 4); + T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; + update_device(in_ratio, in_ratio_h, 4, stream); + + allocate(out_ratio_ref, 4); + T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375}; + update_device(out_ratio_ref, out_ratio_ref_h, 4, stream); + + r.uniform(in_power, len, T(-1.0), T(1.0), stream); + r.uniform(in_sqrt, len, T(0.0), T(1.0), stream); + // r.uniform(in_ratio, len, T(0.0), T(1.0)); + r.uniform(in_sign_flip, len, T(-100.0), T(100.0), stream); + + naivePower(in_power, out_power_ref, len, stream); + power(in_power, len, stream); + + naiveSqrt(in_sqrt, out_sqrt_ref, len); + seqRoot(in_sqrt, len, stream); + + ratio(handle, in_ratio, in_ratio, 4, stream); + + naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col); + signFlip(in_sign_flip, params.n_row, params.n_col, stream); + + allocate(in_recip, 4); + allocate(in_recip_ref, 4); + allocate(out_recip, 4); + // default threshold is 1e-15 + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; + update_device(in_recip, in_recip_h.data(), 4, stream); + update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream); + T recip_scalar = T(1.0); 
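+  // NOTE: with the default ~1e-15 threshold mentioned above, in_recip_ref
+  // encodes out[i] = scalar / in[i] when |in[i]| exceeds the threshold and
+  // 0 otherwise: 0.1 -> 10, 0.01 -> 100, -0.01 -> -100, 0.1e-16 -> 0
+  // (the flush to 0 applies only when setzero is requested, which is why the
+  // out-of-place result is compared on just its first 3 terms below).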
+ + // this `reciprocal()` has to go first bc next one modifies its input + reciprocal(in_recip, out_recip, recip_scalar, 4, stream); + + reciprocal(in_recip, recip_scalar, 4, stream, true); + + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; + allocate(in_smallzero, 4); + allocate(out_smallzero, 4); + allocate(out_smallzero_ref, 4); + update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream); + update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); + setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); + setSmallValuesZero(in_smallzero, 4, stream); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in_power)); + CUDA_CHECK(cudaFree(out_power_ref)); + CUDA_CHECK(cudaFree(in_sqrt)); + CUDA_CHECK(cudaFree(out_sqrt_ref)); + CUDA_CHECK(cudaFree(in_ratio)); + CUDA_CHECK(cudaFree(out_ratio_ref)); + CUDA_CHECK(cudaFree(in_sign_flip)); + CUDA_CHECK(cudaFree(out_sign_flip_ref)); + CUDA_CHECK(cudaFree(in_recip)); + CUDA_CHECK(cudaFree(in_recip_ref)); + CUDA_CHECK(cudaFree(out_recip)); + CUDA_CHECK(cudaFree(in_smallzero)); + CUDA_CHECK(cudaFree(out_smallzero)); + CUDA_CHECK(cudaFree(out_smallzero_ref)); + } + + protected: + MathInputs params; + T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, + *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, + *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; +}; + +const std::vector> inputsf = { + {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; + +const std::vector> inputsd = { + {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; + +typedef MathTest MathPowerTestF; +TEST_P(MathPowerTestF, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathPowerTestD; +TEST_P(MathPowerTestD, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSqrtTestF; +TEST_P(MathSqrtTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSqrtTestD; +TEST_P(MathSqrtTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathRatioTestF; +TEST_P(MathRatioTestF, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathRatioTestD; +TEST_P(MathRatioTestD, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSignFlipTestF; +TEST_P(MathSignFlipTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathSignFlipTestD; +TEST_P(MathSignFlipTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); +} + +typedef MathTest MathReciprocalTestF; +TEST_P(MathReciprocalTestF, Result) { + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, + CompareApprox(params.tolerance))); + + // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. 
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef MathTest<double> MathReciprocalTestD;
+TEST_P(MathReciprocalTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+
+  // the 4th element exercises the setzero=true behavior, which the overload
+  // that produced out_recip does not have, so compare only the first three
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+typedef MathTest<float> MathSetSmallZeroTestF;
+TEST_P(MathSetSmallZeroTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef MathTest<double> MathSetSmallZeroTestD;
+TEST_P(MathSetSmallZeroTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD,
+                         ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD,
+                         ::testing::ValuesIn(inputsd));
+
+}  // namespace matrix
+}  // namespace raft
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
new file mode 100644
index 0000000000..499d24ed41
--- /dev/null
+++ b/cpp/test/matrix/matrix.cu
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/matrix/matrix.cuh>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct MatrixInputs {
+  T tolerance;
+  int n_row;
+  int n_col;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const MatrixInputs<T> &dims) {
+  return os;
+}
+
+template <typename T>
+class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<MatrixInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.n_row * params.n_col;
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(in1, len);
+    raft::allocate(in2, len);
+    raft::allocate(in1_revr, len);
+    r.uniform(in1, len, T(-1.0), T(1.0), stream);
+
+    copy(in1, in2, params.n_row, params.n_col, stream);
+    // copy(in1, in1_revr, params.n_row, params.n_col);
+    // colReverse(in1_revr, params.n_row, params.n_col);
+
+    // outTrunc is a member (rather than a local) so TearDown can free it
+    raft::allocate(outTrunc, 6);
+    truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(in1));
+    CUDA_CHECK(cudaFree(in2));
+    CUDA_CHECK(cudaFree(in1_revr));
+    CUDA_CHECK(cudaFree(outTrunc));
+  }
+
+ protected:
+  MatrixInputs<T> params;
+  T *in1, *in2, *in1_revr, *outTrunc;
+};
+
+const std::vector<MatrixInputs<float>> inputsf2 = {{0.000001f, 4, 4, 1234ULL}};
+
+const std::vector<MatrixInputs<double>> inputsd2 = {
+  {0.00000001, 4, 4, 1234ULL}};
+
+typedef MatrixTest<float> MatrixTestF;
+TEST_P(MatrixTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef MatrixTest<double> MatrixTestD;
+TEST_P(MatrixTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF,
+                         ::testing::ValuesIn(inputsf2));
+
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD,
+                         ::testing::ValuesIn(inputsd2));
+
+}  // namespace matrix
+}  // namespace raft
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
new file mode 100644
index 0000000000..af10dcab30
--- /dev/null
+++ b/cpp/test/random/rng.cu
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <sys/timeb.h>
+#include <time.h>
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace random {
+
+enum RandomType {
+  RNG_Normal,
+  RNG_LogNormal,
+  RNG_Uniform,
+  RNG_Gumbel,
+  RNG_Logistic,
+  RNG_Exp,
+  RNG_Rayleigh,
+  RNG_Laplace
+};
+
+template <typename T, int TPB>
+__global__ void meanKernel(T* out, const T* data, int len) {
+  typedef cub::BlockReduce<T, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  T val = tid < len ?
data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); + __syncthreads(); + T xx = BlockReduce(temp_storage).Sum(val * val); + __syncthreads(); + if (threadIdx.x == 0) { + raft::myAtomicAdd(out, x); + raft::myAtomicAdd(out + 1, xx); + } +} + +template +struct RngInputs { + T tolerance; + int len; + // start, end: for uniform + // mean, sigma: for normal/lognormal + // mean, beta: for gumbel + // mean, scale: for logistic and laplace + // lambda: for exponential + // sigma: for rayleigh + T start, end; + RandomType type; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { + return os; +} + +#include +#include + +template +class RngTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + // Tests are configured with their expected test-values sigma. For example, + // 4 x sigma indicates the test shouldn't fail 99.9% of the time. + num_sigma = 10; + params = ::testing::TestWithParam>::GetParam(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(params.seed, params.gtype); + allocate(data, params.len); + allocate(stats, 2, true); + switch (params.type) { + case RNG_Normal: + r.normal(data, params.len, params.start, params.end, stream); + break; + case RNG_LogNormal: + r.lognormal(data, params.len, params.start, params.end, stream); + break; + case RNG_Uniform: + r.uniform(data, params.len, params.start, params.end, stream); + break; + case RNG_Gumbel: + r.gumbel(data, params.len, params.start, params.end, stream); + break; + case RNG_Logistic: + r.logistic(data, params.len, params.start, params.end, stream); + break; + case RNG_Exp: + r.exponential(data, params.len, params.start, stream); + break; + case RNG_Rayleigh: + r.rayleigh(data, params.len, params.start, stream); + break; + case RNG_Laplace: + r.laplace(data, params.len, params.start, params.end, stream); + break; + }; + static const int threads = 128; + meanKernel + <<>>(stats, data, + params.len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= params.len; + h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + } + + void getExpectedMeanVar(T meanvar[2]) { + switch (params.type) { + case RNG_Normal: + meanvar[0] = params.start; + meanvar[1] = params.end * params.end; + break; + case RNG_LogNormal: { + auto var = params.end * params.end; + auto mu = params.start; + meanvar[0] = raft::myExp(mu + var * T(0.5)); + meanvar[1] = + (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + break; + } + case RNG_Uniform: + meanvar[0] = (params.start + params.end) * T(0.5); + meanvar[1] = params.end - params.start; + meanvar[1] = meanvar[1] * meanvar[1] / T(12.0); + break; + case RNG_Gumbel: { + auto gamma = T(0.577215664901532); + meanvar[0] = params.start + params.end * gamma; + meanvar[1] = T(3.1415) * T(3.1415) * params.end * params.end / T(6.0); + break; + } + case RNG_Logistic: + meanvar[0] = params.start; + meanvar[1] = T(3.1415) * T(3.1415) * params.end * params.end / T(3.0); + break; + case RNG_Exp: + meanvar[0] = T(1.0) / params.start; + meanvar[1] = meanvar[0] * meanvar[0]; + break; + case RNG_Rayleigh: + meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); + meanvar[1] = + ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + break; + case RNG_Laplace: + 
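+        // Laplace(mu, b): mean = mu, variance = 2 * b^2, with
+        // mu = params.start and b = params.end as documented in RngInputs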
+        meanvar[0] = params.start;
+        meanvar[1] = T(2.0) * params.end * params.end;
+        break;
+    }
+  }
+
+ protected:
+  RngInputs<T> params;
+  T *data, *stats;
+  T h_stats[2];  // mean, var
+  int num_sigma;
+};
+
+// The measured mean and standard deviation for each tested distribution are,
+// of course, statistical variables. Thus setting an appropriate testing
+// tolerance essentially requires one to set a probability of test failure. We
+// choose to set this at 3-4 x sigma, i.e., a 99.7-99.9% confidence interval,
+// so that a correct implementation fails the test only very rarely. In quick
+// experiments (using the identical distributions given by NumPy/SciPy), the
+// measured standard deviation is the variable with the greatest variance, so
+// we determined it for each distribution and number of samples (32*1024 or
+// 8*1024). The measured standard deviations are listed below.
+
+// Distribution: StdDev 32*1024, StdDev 8*1024
+// Normal: 0.0055, 0.011
+// LogNormal: 0.05, 0.1
+// Uniform: 0.003, 0.005
+// Gumbel: 0.005, 0.01
+// Logistic: 0.005, 0.01
+// Exp: 0.008, 0.015
+// Rayleigh: 0.0125, 0.025
+// Laplace: 0.02, 0.04
+
+// We generally want 4 x sigma >= 99.9% chance of success
+
+typedef RngTest<float> RngTestF;
+const std::vector<RngInputs<float>> inputsf = {
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenPhilox, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenPhilox, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenPhilox, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenPhilox, 1234ULL},
+  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenPhilox, 1234ULL},
+  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
+  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
+  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPhilox, 1234ULL},
+  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPhilox, 1234ULL},
+  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
+  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
+
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
+  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
+  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
+  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
+  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
+  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
+  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
+  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
+  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
+
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
+  {0.003, 32 * 1024,
-1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL}, + {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL}, + {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL}, + {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL}, + {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL}, + {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL}, + {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL}, + {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL}, + {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, + {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; + +TEST_P(RngTestF, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); + +typedef RngTest RngTestD; +const std::vector> inputsd = { + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenPhilox, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenPhilox, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenPhilox, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenPhilox, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenPhilox, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenPhilox, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL}, + {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL}, + {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL}, + {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPhilox, 1234ULL}, + {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPhilox, 1234ULL}, + {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL}, + {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL}, + + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL}, + {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL}, + {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL}, + {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL}, + {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL}, + {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL}, + {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL}, + {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL}, + {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL}, + + {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL}, + {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL}, + {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL}, + {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL}, + {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenKiss99, 1234ULL}, + {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, 
GenKiss99, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
+  {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
+  {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
+  {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
+  {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
+  {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
+  {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
+TEST_P(RngTestD, Result) {
+  double meanvar[2];
+  getExpectedMeanVar(meanvar);
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
+
+// ---------------------------------------------------------------------- //
+// Test for expected variance in mean calculations
+
+template <typename T>
+T quick_mean(const std::vector<T>& d) {
+  T acc = T(0);
+  for (const auto& di : d) {
+    acc += di;
+  }
+  return acc / d.size();
+}
+
+template <typename T>
+T quick_std(const std::vector<T>& d) {
+  T acc = T(0);
+  T d_mean = quick_mean(d);
+  for (const auto& di : d) {
+    acc += ((di - d_mean) * (di - d_mean));
+  }
+  return std::sqrt(acc / (d.size() - 1));
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+  if (!v.empty()) {
+    out << '[';
+    std::copy(v.begin(), v.end(), std::ostream_iterator<T>(out, ", "));
+    out << "\b\b]";
+  }
+  return out;
+}
+
+// The following tests the 3 random number generators by checking that the
+// measured mean error is close to the well-known analytical result
+// (sigma / sqrt(n_samples)). To compute the mean error, we run a number of
+// experiments, each of which computes the mean of its samples; this gives us
+// a distribution of the mean itself. The mean error is simply the standard
+// deviation of this distribution (the standard deviation of the mean).
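+// As a concrete arithmetic check under the parameters used below
+// (num_samples = 1024, normal with sigma = 0.23): the analytical mean error
+// is sigma / sqrt(n) = 0.23 / 32 ~= 0.0072, and the assertion only requires
+// the measured value to lie within 50% of that.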
+TEST(Rng, MeanError) { + timeb time_struct; + ftime(&time_struct); + int seed = time_struct.millitm; + int num_samples = 1024; + int num_experiments = 1024; + float* data; + float* mean_result; + float* std_result; + int len = num_samples * num_experiments; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(data, len); + allocate(mean_result, num_experiments); + allocate(std_result, num_experiments); + + for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) { + Rng r(seed, rtype); + r.normal(data, len, 3.3f, 0.23f, stream); + // r.uniform(data, len, -1.0, 2.0); + raft::stats::mean(mean_result, data, num_samples, num_experiments, false, + false, stream); + raft::stats::stddev(std_result, data, mean_result, num_samples, + num_experiments, false, false, stream); + std::vector h_mean_result(num_experiments); + std::vector h_std_result(num_experiments); + update_host(h_mean_result.data(), mean_result, num_experiments, stream); + update_host(h_std_result.data(), std_result, num_experiments, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + auto d_mean = quick_mean(h_mean_result); + + // std-dev of mean; also known as mean error + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); + auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); + + // std::cout << "measured mean error: " << d_std_of_mean << "\n"; + // std::cout << "expected mean error: " << d_std/std::sqrt(num_samples) << "\n"; + + auto diff_expected_vs_measured_mean_error = + std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); + + ASSERT_TRUE( + (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + } + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(mean_result)); + CUDA_CHECK(cudaFree(std_result)); + + // std::cout << "mean_res:" << h_mean_result << "\n"; +} + +template +class ScaledBernoulliTest : public ::testing::Test { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + + Rng r(42); + + allocate(data, len * sizeof(T), stream); + r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); + } + + void TearDown() override { CUDA_CHECK(cudaFree(data)); } + + void rangeCheck() { + T* h_data = new T[len]; + update_host(h_data, data, len, stream); + ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { + return a < -scale || a > scale; + })); + delete[] h_data; + } + + T* data; + cudaStream_t stream; +}; + +typedef ScaledBernoulliTest ScaledBernoulliTest1; +TEST_F(ScaledBernoulliTest1, RangeCheck) { rangeCheck(); } + +typedef ScaledBernoulliTest ScaledBernoulliTest2; +TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); } + +template +class BernoulliTest : public ::testing::Test { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(42); + allocate(data, len * sizeof(bool), stream); + r.bernoulli(data, len, T(0.5), stream); + } + + void TearDown() override { CUDA_CHECK(cudaFree(data)); } + + void trueFalseCheck() { + // both true and false values must be present + bool* h_data = new bool[len]; + update_host(h_data, data, len, stream); + ASSERT_TRUE(std::any_of(h_data, h_data + len, [](bool a) { return a; })); + ASSERT_TRUE(std::any_of(h_data, h_data + len, [](bool a) { return !a; })); + delete[] h_data; + } + + bool* data; + cudaStream_t stream; +}; + +typedef BernoulliTest BernoulliTest1; +TEST_F(BernoulliTest1, TrueFalseCheck) { trueFalseCheck(); } + +typedef BernoulliTest BernoulliTest2; 
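+// as with the float fixture above, only the presence of both outcomes is
+// checked here, not the p = 0.5 split itself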
+TEST_F(BernoulliTest2, TrueFalseCheck) { trueFalseCheck(); } + +/** Rng::normalTable tests */ +template +struct RngNormalTableInputs { + T tolerance; + int rows, cols; + T mu, sigma; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, + const RngNormalTableInputs& dims) { + return os; +} + +template +class RngNormalTableTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + // Tests are configured with their expected test-values sigma. For example, + // 4 x sigma indicates the test shouldn't fail 99.9% of the time. + num_sigma = 10; + params = ::testing::TestWithParam>::GetParam(); + int len = params.rows * params.cols; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + Rng r(params.seed, params.gtype); + allocate(data, len); + allocate(stats, 2, true); + allocate(mu_vec, params.cols); + r.fill(mu_vec, params.cols, params.mu, stream); + T* sigma_vec = nullptr; + r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, + params.sigma, stream); + static const int threads = 128; + meanKernel + <<>>(stats, data, len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= len; + h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + CUDA_CHECK(cudaFree(mu_vec)); + } + + void getExpectedMeanVar(T meanvar[2]) { + meanvar[0] = params.mu; + meanvar[1] = params.sigma * params.sigma; + } + + protected: + RngNormalTableInputs params; + T *data, *stats, *mu_vec; + T h_stats[2]; // mean, var + int num_sigma; +}; + +typedef RngNormalTableTest RngNormalTableTestF; +const std::vector> inputsf_t = { + {0.0055, 32, 1024, 1.f, 1.f, GenPhilox, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenPhilox, 1234ULL}, + {0.0055, 32, 1024, 1.f, 1.f, GenTaps, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenTaps, 1234ULL}, + {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, + {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; + +TEST_P(RngNormalTableTestF, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, + ::testing::ValuesIn(inputsf_t)); + +typedef RngNormalTableTest RngNormalTableTestD; +const std::vector> inputsd_t = { + {0.0055, 32, 1024, 1.0, 1.0, GenPhilox, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenPhilox, 1234ULL}, + {0.0055, 32, 1024, 1.0, 1.0, GenTaps, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, + {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, + {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; +TEST_P(RngNormalTableTestD, Result) { + double meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, + ::testing::ValuesIn(inputsd_t)); + +struct RngAffineInputs { + int n; + unsigned long long seed; +}; + +class RngAffineTest : public ::testing::TestWithParam { + protected: + void SetUp() override { + params = ::testing::TestWithParam::GetParam(); + Rng r(params.seed); + 
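+    // affine_transform_params() draws the constants a and b of an affine
+    // index transform (presumably idx -> (a * idx + b) % n); check() below
+    // verifies that a is coprime with n, which makes the map a bijection,
+    // and that 0 <= b < n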
r.affine_transform_params(params.n, a, b); + } + + void check() { + ASSERT_TRUE(gcd(a, params.n) == 1); + ASSERT_TRUE(0 <= b && b < params.n); + } + + private: + RngAffineInputs params; + int a, b; +}; // RngAffineTest + +const std::vector inputs_affine = { + {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, + {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, + {2568, 123456ULL}, {2568, 1234567890ULL}, +}; +TEST_P(RngAffineTest, Result) { check(); } +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, + ::testing::ValuesIn(inputs_affine)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu new file mode 100644 index 0000000000..92f12206e8 --- /dev/null +++ b/cpp/test/random/rng_int.cu @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace random { + +enum RandomType { RNG_Uniform }; + +template +__global__ void meanKernel(float *out, const T *data, int len) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + float val = tid < len ? 
data[tid] : T(0); + float x = BlockReduce(temp_storage).Sum(val); + __syncthreads(); + float xx = BlockReduce(temp_storage).Sum(val * val); + __syncthreads(); + if (threadIdx.x == 0) { + raft::myAtomicAdd(out, x); + raft::myAtomicAdd(out + 1, xx); + } +} + +template +struct RngInputs { + float tolerance; + int len; + // start, end: for uniform + // mean, sigma: for normal/lognormal + // mean, beta: for gumbel + // mean, scale: for logistic and laplace + // lambda: for exponential + // sigma: for rayleigh + T start, end; + RandomType type; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { + return os; +} + +template +class RngTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + Rng r(params.seed, params.gtype); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + allocate(data, params.len); + allocate(stats, 2, true); + switch (params.type) { + case RNG_Uniform: + r.uniformInt(data, params.len, params.start, params.end, stream); + break; + }; + static const int threads = 128; + meanKernel + <<>>(stats, data, + params.len); + update_host(h_stats, stats, 2, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + h_stats[0] /= params.len; + h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(stats)); + } + + void getExpectedMeanVar(float meanvar[2]) { + switch (params.type) { + case RNG_Uniform: + meanvar[0] = (params.start + params.end) * 0.5f; + meanvar[1] = params.end - params.start; + meanvar[1] = meanvar[1] * meanvar[1] / 12.f; + break; + }; + } + + protected: + RngInputs params; + T *data; + float *stats; + float h_stats[2]; // mean, var +}; + +typedef RngTest RngTestU32; +const std::vector> inputs_u32 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestU32, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); + +typedef RngTest RngTestU64; +const std::vector> inputs_u64 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestU64, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); + +typedef RngTest RngTestS32; +const std::vector> inputs_s32 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 
1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestS32, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); + +typedef RngTest RngTestS64; +const std::vector> inputs_s64 = { + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, + {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, + {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; +TEST_P(RngTestS64, Result) { + float meanvar[2]; + getExpectedMeanVar(meanvar); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu new file mode 100644 index 0000000000..d7e52a8958 --- /dev/null +++ b/cpp/test/random/sample_without_replacement.cu @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace random { + +// Terminology: +// SWoR - Sample Without Replacement + +template +struct SWoRInputs { + int len, sampledLen; + int largeWeightIndex; + T largeWeight; + GeneratorType gtype; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { + return os; +} + +template +class SWoRTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + CUDA_CHECK(cudaStreamCreate(&stream)); + + Rng r(params.seed, params.gtype); + allocate(in, params.len); + allocate(wts, params.len); + allocate(out, params.sampledLen); + allocate(outIdx, params.sampledLen); + h_outIdx.resize(params.sampledLen); + r.uniform(in, params.len, T(-1.0), T(1.0), stream); + r.uniform(wts, params.len, T(1.0), T(2.0), stream); + if (params.largeWeightIndex >= 0) { + update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, + stream); + } + r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, + params.len, stream); + update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(wts)); + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(outIdx)); + } + + protected: + SWoRInputs params; + T *in, *out, *wts; + int* outIdx; + std::vector h_outIdx; + cudaStream_t stream; + raft::handle_t handle; +}; + +typedef SWoRTest SWoRTestF; +const std::vector> inputsf = { + {1024, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenPhilox, 1234ULL}, + {1024, 512, 10, 100000.f, GenPhilox, 1234ULL}, + + {1024, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenTaps, 1234ULL}, + {1024, 512, 10, 100000.f, GenTaps, 
1234ULL}, + + {1024, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024 - 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.f, GenKiss99, 1234ULL}, + {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, +}; + +TEST_P(SWoRTestF, Result) { + std::set occurence; + for (int i = 0; i < params.sampledLen; ++i) { + auto val = h_outIdx[i]; + // indices must be in the given range + ASSERT_TRUE(0 <= val && val < params.len) + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; + // indices should not repeat + ASSERT_TRUE(occurence.find(val) == occurence.end()) + << "repeated index @i=" << i << " idx=" << val; + occurence.insert(val); + } + // if there's a skewed distribution, the top index should correspond to the + // particular item with a large weight + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } +} +INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); + +typedef SWoRTest SWoRTestD; +const std::vector> inputsd = { + {1024, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.0, GenPhilox, 1234ULL}, + {1024, 512, 10, 100000.0, GenPhilox, 1234ULL}, + + {1024, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenTaps, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024 
+ 2, 1024 + 2, -1, 0.0, GenTaps, 1234ULL}, + {1024, 512, 10, 100000.0, GenTaps, 1234ULL}, + + {1024, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024 - 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 1, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 512 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024 + 2, 1024 + 2, -1, 0.0, GenKiss99, 1234ULL}, + {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, +}; + +TEST_P(SWoRTestD, Result) { + std::set occurence; + for (int i = 0; i < params.sampledLen; ++i) { + auto val = h_outIdx[i]; + // indices must be in the given range + ASSERT_TRUE(0 <= val && val < params.len) + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; + // indices should not repeat + ASSERT_TRUE(occurence.find(val) == occurence.end()) + << "repeated index @i=" << i << " idx=" << val; + occurence.insert(val); + } + // if there's a skewed distribution, the top index should correspond to the + // particular item with a large weight + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } +} +INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); + +} // namespace random +} // namespace raft diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu new file mode 100644 index 0000000000..4a3b0ed196 --- /dev/null +++ b/cpp/test/stats/mean.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace stats { + +template +struct MeanInputs { + T tolerance, mean; + int rows, cols; + bool sample, rowMajor; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { + return os; +} + +template +class MeanTest : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + int rows = params.rows, cols = params.cols; + int len = rows * cols; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + allocate(data, len); + allocate(mean_act, cols); + r.normal(data, len, params.mean, (T)1.0, stream); + + meanSGtest(data, stream); + } + + void meanSGtest(T *data, cudaStream_t stream) { + int rows = params.rows, cols = params.cols; + + mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(mean_act)); + } + + protected: + MeanInputs params; + T *data, *mean_act; +}; + +// Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the +// measured mean (of a normal distribution) will fall outside of an epsilon of +// 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) +const std::vector> inputsf = { + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = { + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; + +typedef MeanTest MeanTestF; +TEST_P(MeanTestF, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); +} + +typedef MeanTest MeanTestD; +TEST_P(MeanTestD, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace stats +} 
// end namespace raft diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu new file mode 100644 index 0000000000..8b0d607561 --- /dev/null +++ b/cpp/test/stats/mean_center.cu @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "../linalg/matrix_vector_op.cuh" +#include "../test_utils.h" + +namespace raft { +namespace stats { + +template +struct MeanCenterInputs { + T tolerance, mean; + IdxType rows, cols; + bool sample, rowMajor, bcastAlongRows; + unsigned long long int seed; +}; + +template +::std::ostream &operator<<(::std::ostream &os, + const MeanCenterInputs &dims) { + return os; +} + +template +class MeanCenterTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + raft::random::Rng r(params.seed); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + auto rows = params.rows, cols = params.cols; + auto len = rows * cols; + IdxType vecLen = params.bcastAlongRows ? cols : rows; + + raft::allocate(out, len); + raft::allocate(out_ref, len); + raft::allocate(data, len); + raft::allocate(meanVec, vecLen); + r.normal(data, len, params.mean, (T)1.0, stream); + raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, + stream); + meanCenter(out, data, meanVec, cols, rows, params.rowMajor, + params.bcastAlongRows, stream); + raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, + params.rowMajor, params.bcastAlongRows, (T)-1.0); + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(data)); + CUDA_CHECK(cudaFree(meanVec)); + } + + protected: + MeanCenterInputs params; + T *data, *meanVec, *out, *out_ref; +}; + +const std::vector> inputsf_i32 = { + {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, 
true, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestF_i32; +TEST_P(MeanCenterTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL}, + {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL}, + {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, + {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestF_i64; +TEST_P(MeanCenterTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, false, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest 
MeanCenterTestD_i32; +TEST_P(MeanCenterTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, true, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, true, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, true, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, true, 1234ULL}, + {0.05, 1.0, 1024, 32, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, false, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, false, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, false, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, false, false, 1234ULL}, + {0.05, 1.0, 1024, 32, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 64, true, true, false, 1234ULL}, + {0.05, 1.0, 1024, 128, true, true, false, 1234ULL}, + {0.05, -1.0, 1024, 32, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, + {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; +typedef MeanCenterTest MeanCenterTestD_i64; +TEST_P(MeanCenterTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace stats +} // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu new file mode 100644 index 0000000000..ff2698788f --- /dev/null +++ b/cpp/test/stats/stddev.cu @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
new file mode 100644
index 0000000000..ff2698788f
--- /dev/null
+++ b/cpp/test/stats/stddev.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/matrix/math.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/stats/mean.cuh>
+#include <raft/stats/stddev.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct StdDevInputs {
+  T tolerance, mean, stddev;
+  int rows, cols;
+  bool sample, rowMajor;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const StdDevInputs<T> &dims) {
+  return os << "{rows: " << dims.rows << ", cols: " << dims.cols
+            << ", sample: " << dims.sample << ", rowMajor: " << dims.rowMajor
+            << "}";
+}
+
+template <typename T>
+class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<StdDevInputs<T>>::GetParam();
+    random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols;
+    int len = rows * cols;
+
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    allocate(data, len);
+    allocate(mean_act, cols);
+    allocate(stddev_act, cols);
+    allocate(vars_act, cols);
+    r.normal(data, len, params.mean, params.stddev, stream);
+    stdVarSGtest(data, stream);
+    // make sure all work enqueued on this stream has finished before the
+    // stream is destroyed and the results are read on another stream
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void stdVarSGtest(T *data, cudaStream_t stream) {
+    int rows = params.rows, cols = params.cols;
+
+    mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
+
+    stddev(stddev_act, data, mean_act, cols, rows, params.sample,
+           params.rowMajor, stream);
+
+    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor,
+         stream);
+
+    // vars_act holds variances; take the square root in place so it can be
+    // compared element-wise against the directly computed stddev_act
+    raft::matrix::seqRoot(vars_act, T(1), cols, stream);
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(mean_act));
+    CUDA_CHECK(cudaFree(stddev_act));
+    CUDA_CHECK(cudaFree(vars_act));
+  }
+
+ protected:
+  StdDevInputs<T> params;
+  T *data, *mean_act, *stddev_act, *vars_act;
+};
+
+const std::vector<StdDevInputs<float>> inputsf = {
+  {0.1f, 1.f, 2.f, 1024, 32, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 64, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 128, true, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 256, true, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 32, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 64, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 128, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 256, false, false, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 32, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 64, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 128, true, true, 1234ULL},
+  {0.1f, 1.f, 2.f, 1024, 256, true, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 32, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 64, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 128, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1024, 256, false, true, 1234ULL}};
+
+const std::vector<StdDevInputs<double>> inputsd = {
+  {0.1, 1.0, 2.0, 1024, 32, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 64, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 128, true, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 256, true, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 32, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 64, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 128, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 256, false, false, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 32, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 64, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 128, true, true, 1234ULL},
+  {0.1, 1.0, 2.0, 1024, 256, true, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 32, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 64, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 128, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
+
+typedef StdDevTest<float> StdDevTestF;
+TEST_P(StdDevTestF, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
+}
+
+typedef StdDevTest<double> StdDevTestD;
+TEST_P(StdDevTestD, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
+
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF,
+                         ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD,
+                         ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
new file mode 100644
index 0000000000..c3140d4588
--- /dev/null
+++ b/cpp/test/stats/sum.cu
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/stats/sum.cuh>
+#include <vector>
+#include "../test_utils.h"
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct SumInputs {
+  T tolerance;
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os, const SumInputs<T> &dims) {
+  return os << "{rows: " << dims.rows << ", cols: " << dims.cols << "}";
+}
+
+template <typename T>
+class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<SumInputs<T>>::GetParam();
+    int rows = params.rows, cols = params.cols;
+    int len = rows * cols;
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(data, len);
+
+    // fill the matrix with ones on the host (a std::vector instead of a
+    // non-standard variable-length array)
+    std::vector<T> data_h(len, T(1));
+
+    raft::update_device(data, data_h.data(), len, stream);
+
+    raft::allocate(sum_act, cols);
+    sum(sum_act, data, cols, rows, false, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(sum_act));
+  }
+
+ protected:
+  SumInputs<T> params;
+  T *data, *sum_act;
+};
+
+const std::vector<SumInputs<float>> inputsf = {{0.05f, 1024, 32, 1234ULL},
+                                               {0.05f, 1024, 256, 1234ULL}};
+
+const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
+                                                {0.05, 1024, 256, 1234ULL}};
+
+typedef SumTest<float> SumTestF;
+TEST_P(SumTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef SumTest<double> SumTestD;
+TEST_P(SumTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_SUITE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
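Reviewer note: the expectation in `SumTestF`/`SumTestD` follows directly from the fixture filling the matrix with ones, so every one of the `cols` column sums is exactly `rows` and no reference kernel is needed. A minimal standalone sketch of driving the same prim (hypothetical buffer names; assumes the `sum`, `allocate`, and `update_device` signatures the fixture above calls):

    int rows = 1024, cols = 32;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    float *d_in, *d_out;
    raft::allocate(d_in, rows * cols);  // column-major rows x cols matrix
    raft::allocate(d_out, cols);        // one sum per column
    std::vector<float> ones(rows * cols, 1.f);
    raft::update_device(d_in, ones.data(), rows * cols, stream);
    raft::stats::sum(d_out, d_in, cols, rows, false, stream);  // column sums
    ASSERT_TRUE(raft::devArrMatch(float(rows), d_out, cols,
                                  raft::CompareApprox<float>(1e-4f)));
    CUDA_CHECK(cudaStreamDestroy(stream));
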
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
new file mode 100644
index 0000000000..1629e8aa34
--- /dev/null
+++ b/cpp/test/test_utils.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+
+namespace raft {
+
+template <typename T>
+struct Compare {
+  bool operator()(const T &a, const T &b) const { return a == b; }
+};
+
+// generic absolute value, defined before the comparators below so that their
+// unqualified abs() calls resolve to this overload
+template <typename T>
+T abs(const T &a) {
+  return a > T(0) ? a : -a;
+}
+
+template <typename T>
+struct CompareApprox {
+  CompareApprox(T eps_) : eps(eps_) {}
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(a - b);
+    T m = std::max(abs(a), abs(b));
+    T ratio = diff >= eps ? diff / m : diff;
+
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
+template <typename T>
+struct CompareApproxAbs {
+  CompareApproxAbs(T eps_) : eps(eps_) {}
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(abs(a) - abs(b));
+    T m = std::max(abs(a), abs(b));
+    T ratio = diff >= eps ? diff / m : diff;
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
+/**
+ * @brief Helper function to compare 2 device n-D arrays with custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value(s)
+ * @param actual actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ * @{
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t size, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  // unique_ptr<T[]> releases the staging buffers with delete[]
+  std::unique_ptr<T[]> exp_h(new T[size]);
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(exp_h.get(), expected, size, stream);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < size; ++i) {
+    auto exp = exp_h.get()[i];
+    auto act = act_h.get()[i];
+    if (!eq_compare(exp, act)) {
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << exp << " @" << i;
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
+                                     L eq_compare, cudaStream_t stream = 0) {
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < size; ++i) {
+    auto act = act_h.get()[i];
+    if (!eq_compare(expected, act)) {
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << expected << " @" << i;
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
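Reviewer note: `CompareApprox` only divides by the larger magnitude when the raw difference already exceeds `eps`; otherwise it compares the difference absolutely, which keeps near-zero expected values from being rejected by a purely relative test. A quick host-side illustration (values invented for the walkthrough):

    raft::CompareApprox<float> approx(0.05f);  // eps = 5%
    EXPECT_TRUE(approx(100.f, 104.f));   // diff=4 >= eps: 4/104 ~ 0.038 <= 0.05
    EXPECT_FALSE(approx(100.f, 106.f));  // relative: 6/106 ~ 0.057 > 0.05
    EXPECT_TRUE(approx(0.f, 0.01f));     // diff=0.01 < eps: compared absolutely
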
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t rows, size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> exp_h(new T[size]);
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(exp_h.get(), expected, size, stream);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      auto idx = i * cols + j;  // row major assumption!
+      auto exp = exp_h.get()[idx];
+      auto act = act_h.get()[idx];
+      if (!eq_compare(exp, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << exp << " @" << i << ","
+               << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
+                                     size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      auto idx = i * cols + j;  // row major assumption!
+      auto act = act_h.get()[idx];
+      if (!eq_compare(expected, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+/**
+ * @brief Helper function to compare a device n-D array with an expected array
+ * on the host, using a custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host array of expected value(s)
+ * @param actual_d device array of actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatchHost(const T *expected_h,
+                                         const T *actual_d, size_t size,
+                                         L eq_compare,
+                                         cudaStream_t stream = 0) {
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual_d, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  bool ok = true;
+  auto fail = testing::AssertionFailure();
+  for (size_t i(0); i < size; ++i) {
+    auto exp = expected_h[i];
+    auto act = act_h.get()[i];
+    if (!eq_compare(exp, act)) {
+      ok = false;
+      fail << "actual=" << act << " != expected=" << exp << " @" << i << "; ";
+    }
+  }
+  if (!ok) return fail;
+  return testing::AssertionSuccess();
+}
+
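Reviewer note: `devArrMatchHost` is the variant to reach for when the reference is computed on the CPU, and unlike the overloads above it accumulates every mismatching index into a single failure message instead of stopping at the first. A usage sketch with hypothetical names (`d_result`, `n`):

    #include <numeric>  // std::iota
    std::vector<float> ref(n);               // CPU-computed reference
    std::iota(ref.begin(), ref.end(), 0.f);  // e.g. 0, 1, 2, ...
    ASSERT_TRUE(raft::devArrMatchHost(ref.data(), d_result, n,
                                      raft::CompareApprox<float>(1e-4f)));
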
+/**
+ * @brief Helper function to compare diagonal values of a 2D matrix
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value along diagonal
+ * @param actual actual matrix
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult diagonalMatch(T expected, const T *actual,
+                                       size_t rows, size_t cols, L eq_compare,
+                                       cudaStream_t stream = 0) {
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host(act_h.get(), actual, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < rows; ++i) {
+    for (size_t j(0); j < cols; ++j) {
+      if (i != j) continue;
+      auto idx = i * cols + j;  // row major assumption!
+      auto act = act_h.get()[idx];
+      if (!eq_compare(expected, act)) {
+        return testing::AssertionFailure()
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
+      }
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult match(const T expected, T actual, L eq_compare) {
+  if (!eq_compare(expected, actual)) {
+    return testing::AssertionFailure()
+           << "actual=" << actual << " != expected=" << expected;
+  }
+  return testing::AssertionSuccess();
+}
+
+/** @} */
+
+/** time the function call 'func' using cuda events */
+#define TIMEIT_LOOP(ms, count, func)                    \
+  do {                                                  \
+    cudaEvent_t start, stop;                            \
+    CUDA_CHECK(cudaEventCreate(&start));                \
+    CUDA_CHECK(cudaEventCreate(&stop));                 \
+    CUDA_CHECK(cudaEventRecord(start));                 \
+    for (int i = 0; i < count; ++i) {                   \
+      func;                                             \
+    }                                                   \
+    CUDA_CHECK(cudaEventRecord(stop));                  \
+    CUDA_CHECK(cudaEventSynchronize(stop));             \
+    ms = 0.f;                                           \
+    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop)); \
+    ms /= count; /* average time per call */            \
+    CUDA_CHECK(cudaEventDestroy(start));                \
+    CUDA_CHECK(cudaEventDestroy(stop));                 \
+  } while (0)
+
+}  // end namespace raft
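Reviewer note: to close, a sketch of `TIMEIT_LOOP` in use (hypothetical `d_out`/`d_in` buffers; the events are recorded on the default stream, so the timed call should submit its work there). `ms` comes back as the average time per launch over `count` iterations:

    float ms = 0.f;
    constexpr int count = 100;
    TIMEIT_LOOP(ms, count, raft::stats::sum(d_out, d_in, cols, rows, false, 0));
    std::cout << "sum(): " << ms << " ms per call" << std::endl;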