From 4c089f681d96f4bcf927842c098eab84a0920925 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@users.noreply.github.com>
Date: Fri, 11 Feb 2022 18:21:59 -0500
Subject: [PATCH] Moving remaining stats prims from cuml (#507)

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/507
---
 cpp/include/raft/common/seive.hpp             | 125 +++++
 cpp/include/raft/device_utils.cuh             | 108 ++++
 cpp/include/raft/random/detail/make_blobs.cuh | 133 +++--
 .../knn/detail/epsilon_neighborhood.cuh       | 243 +++++++++
 .../raft/spatial/knn/epsilon_neighborhood.hpp |  59 +++
 cpp/include/raft/stats/common.hpp             |  67 +++
 cpp/include/raft/stats/cov.hpp                |  58 +++
 cpp/include/raft/stats/detail/cov.cuh         |  95 ++++
 cpp/include/raft/stats/detail/histogram.cuh   | 492 ++++++++++++++++++
 cpp/include/raft/stats/detail/minmax.cuh      | 239 +++++++++
 .../raft/stats/detail/weighted_mean.cuh       |  94 ++++
 cpp/include/raft/stats/histogram.hpp          |  62 +++
 cpp/include/raft/stats/minmax.hpp             |  70 +++
 cpp/include/raft/stats/weighted_mean.hpp      |  60 +++
 cpp/test/CMakeLists.txt                       |   6 +
 cpp/test/common/seive.cu                      |  35 ++
 cpp/test/linalg/rsvd.cu                       | 112 ++--
 cpp/test/random/make_blobs.cu                 | 108 ++--
 cpp/test/spatial/epsilon_neighborhood.cu      | 129 +++++
 cpp/test/stats/cov.cu                         | 185 +++++++
 cpp/test/stats/histogram.cu                   | 262 ++++++++++
 cpp/test/stats/minmax.cu                      | 202 +++++++
 cpp/test/stats/weighted_mean.cu               | 231 ++++++++
 23 files changed, 3027 insertions(+), 148 deletions(-)
 create mode 100644 cpp/include/raft/common/seive.hpp
 create mode 100644 cpp/include/raft/device_utils.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
 create mode 100644 cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
 create mode 100644 cpp/include/raft/stats/common.hpp
 create mode 100644 cpp/include/raft/stats/cov.hpp
 create mode 100644 cpp/include/raft/stats/detail/cov.cuh
 create mode 100644 cpp/include/raft/stats/detail/histogram.cuh
 create mode 100644 cpp/include/raft/stats/detail/minmax.cuh
 create mode 100644 cpp/include/raft/stats/detail/weighted_mean.cuh
 create mode 100644 cpp/include/raft/stats/histogram.hpp
 create mode 100644 cpp/include/raft/stats/minmax.hpp
 create mode 100644 cpp/include/raft/stats/weighted_mean.hpp
 create mode 100644 cpp/test/common/seive.cu
 create mode 100644 cpp/test/spatial/epsilon_neighborhood.cu
 create mode 100644 cpp/test/stats/cov.cu
 create mode 100644 cpp/test/stats/histogram.cu
 create mode 100644 cpp/test/stats/minmax.cu
 create mode 100644 cpp/test/stats/weighted_mean.cu
diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp
new file mode 100644
index 0000000000..e613f1e5c2
--- /dev/null
+++ b/cpp/include/raft/common/seive.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <vector>
+
+// Taken from:
+//  https://github.com/teju85/programming/blob/master/euler/include/seive.h
+
+namespace raft {
+namespace common {
+
+/**
+ * @brief Implementation of 'Seive of Eratosthenes'
+ */
+class Seive {
+ public:
+  /**
+   * @param _num number of integers for which seive is needed
+   */
+  Seive(unsigned _num)
+  {
+    N = _num;
+    generateSeive();
+  }
+
+  /**
+   * @brief Check whether a number is prime or not
+   * @param num number to be checked
+   * @return true if the 'num' is prime, else false
+   */
+  bool isPrime(unsigned num) const
+  {
+    unsigned mask, pos;
+    if (num <= 1) { return false; }
+    if (num == 2) { return true; }
+    if (!(num & 1)) { return false; }
+    getMaskPos(num, mask, pos);
+    return (seive[pos] & mask);
+  }
+
+ private:
+  void generateSeive()
+  {
+    auto sqN  = fastIntSqrt(N);
+    auto size = raft::ceildiv<unsigned>(N, sizeof(unsigned) * 8);
+    seive.resize(size);
+    // assume all to be primes initially
+    for (auto& itr : seive) {
+      itr = 0xffffffffu;
+    }
+    unsigned cid  = 0;
+    unsigned cnum = getNum(cid);
+    while (cnum <= sqN) {
+      do {
+        ++cid;
+        cnum = getNum(cid);
+        if (isPrime(cnum)) { break; }
+      } while (cnum <= sqN);
+      auto cnum2 = cnum << 1;
+      // 'unmark' all the 'odd' multiples of the current prime
+      for (unsigned i = 3, num = i * cnum; num <= N; i += 2, num += cnum2) {
+        unmark(num);
+      }
+    }
+  }
+
+  unsigned getId(unsigned num) const { return (num >> 1); }
+
+  unsigned getNum(unsigned id) const
+  {
+    if (id == 0) { return 2; }
+    return ((id << 1) + 1);
+  }
+
+  void getMaskPos(unsigned num, unsigned& mask, unsigned& pos) const
+  {
+    pos  = getId(num);
+    mask = 1 << (pos & 0x1f);
+    pos >>= 5;
+  }
+
+  void unmark(unsigned num)
+  {
+    unsigned mask, pos;
+    getMaskPos(num, mask, pos);
+    seive[pos] &= ~mask;
+  }
+
+  // REF: http://www.azillionmonkeys.com/qed/ulerysqroot.pdf
+  unsigned fastIntSqrt(unsigned val)
+  {
+    unsigned g = 0;
+    auto bshft = 15u, b = 1u << bshft;
+    do {
+      unsigned temp = ((g << 1) + b) << bshft--;
+      if (val >= temp) {
+        g += b;
+        val -= temp;
+      }
+    } while (b >>= 1);
+    return g;
+  }
+
+  /** find all primes till this number */
+  unsigned N;
+  /** the seive */
+  std::vector<unsigned> seive;
+};
+};  // namespace common
+};  // namespace raft
diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh
new file mode 100644
index 0000000000..d89a484109
--- /dev/null
+++ b/cpp/include/raft/device_utils.cuh
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <utility>  // pair
+
+namespace raft {
+
+// TODO move to raft https://github.com/rapidsai/raft/issues/90
+/** helper method to get the compute capability version numbers */
+inline std::pair<int, int> getDeviceCapability()
+{
+  int devId;
+  RAFT_CUDA_TRY(cudaGetDevice(&devId));
+  int major, minor;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId));
+  return std::make_pair(major, minor);
+}
+
+/**
+ * @brief Batched warp-level sum reduction
+ *
+ * @tparam T        data type
+ * @tparam NThreads Number of threads in the warp doing independent reductions
+ *
+ * @param[in] val input value
+ * @return        for the first "group" of threads, the reduced value. All
+ *                others will contain unusable values!
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block and also doesn't support this kind of
+ *       batched reduction operation
+ * @note All threads in the warp must enter this function together
+ *
+ * @todo Expand this to support arbitrary reduction ops
+ */
+template <typename T, int NThreads>
+DI T batchedWarpReduce(T val)
+{
+#pragma unroll
+  for (int i = NThreads; i < raft::WarpSize; i <<= 1) {
+    val += raft::shfl(val, raft::laneId() + i);
+  }
+  return val;
+}
+
+/**
+ * @brief 1-D block-level batched sum reduction
+ *
+ * @tparam T        data type
+ * @tparam NThreads Number of threads in the warp doing independent reductions
+ *
+ * @param val  input value
+ * @param smem shared memory region needed for storing intermediate results. It
+ *             must alteast be of size: `sizeof(T) * nWarps * NThreads`
+ * @return     for the first "group" of threads in the block, the reduced value.
+ *             All others will contain unusable values!
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block and also doesn't support this kind of
+ *       batched reduction operation
+ * @note All threads in the block must enter this function together
+ *
+ * @todo Expand this to support arbitrary reduction ops
+ */
+template <typename T, int NThreads>
+DI T batchedBlockReduce(T val, char* smem)
+{
+  auto* sTemp                  = reinterpret_cast<T*>(smem);
+  constexpr int nGroupsPerWarp = raft::WarpSize / NThreads;
+  static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!");
+  const int nGroups = (blockDim.x + NThreads - 1) / NThreads;
+  const int lid     = raft::laneId();
+  const int lgid    = lid % NThreads;
+  const int gid     = threadIdx.x / NThreads;
+  const auto wrIdx  = (gid / nGroupsPerWarp) * NThreads + lgid;
+  const auto rdIdx  = gid * NThreads + lgid;
+  for (int i = nGroups; i > 0;) {
+    auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp;
+    if (gid < iAligned) {
+      val = batchedWarpReduce<T, NThreads>(val);
+      if (lid < NThreads) sTemp[wrIdx] = val;
+    }
+    __syncthreads();
+    i /= nGroupsPerWarp;
+    if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); }
+    __syncthreads();
+  }
+  return val;
+}
+
+}  // namespace raft
diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh
index 528d20a284..fff1ab835b 100644
--- a/cpp/include/raft/random/detail/make_blobs.cuh
+++ b/cpp/include/raft/random/detail/make_blobs.cuh
@@ -16,18 +16,18 @@
 
 #pragma once
 
+#include "permute.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/unary_op.hpp>
-#include <raft/random/permute.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 #include <vector>
 
-namespace raft::random {
-namespace detail {
+namespace raft {
+namespace random {
 
-namespace {
+namespace detail {
 
 // generate the labels first and shuffle them instead of shuffling the dataset
 template <typename IdxT>
@@ -90,23 +90,29 @@ DI void get_mu_sigma(DataT& mu,
 }
 
 template <typename DataT, typename IdxT>
-void generate_data(DataT* out,
-                   const IdxT* labels,
-                   IdxT n_rows,
-                   IdxT n_cols,
-                   IdxT n_clusters,
-                   cudaStream_t stream,
-                   bool row_major,
-                   const DataT* centers,
-                   const DataT* cluster_std,
-                   const DataT cluster_std_scalar,
-                   raft::random::Rng& rng)
+__global__ void generate_data_kernel(DataT* out,
+                                     const IdxT* labels,
+                                     IdxT n_rows,
+                                     IdxT n_cols,
+                                     IdxT n_clusters,
+                                     bool row_major,
+                                     const DataT* centers,
+                                     const DataT* cluster_std,
+                                     const DataT cluster_std_scalar,
+                                     raft::random::RngState rng_state)
 {
-  auto op = [=] __device__(DataT & val1, DataT & val2, IdxT idx1, IdxT idx2) {
+  uint64_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  raft::random::PhiloxGenerator gen(rng_state, tid);
+  const IdxT stride = gridDim.x * blockDim.x;
+  IdxT len          = n_rows * n_cols;
+  for (IdxT idx = tid; idx < len; idx += stride) {
+    DataT val1, val2;
+    gen.next(val1);
+    gen.next(val2);
     DataT mu1, sigma1, mu2, sigma2;
     get_mu_sigma(mu1,
                  sigma1,
-                 idx1,
+                 idx,
                  labels,
                  row_major,
                  centers,
@@ -117,7 +123,7 @@ void generate_data(DataT* out,
                  n_clusters);
     get_mu_sigma(mu2,
                  sigma2,
-                 idx2,
+                 idx + stride,
                  labels,
                  row_major,
                  centers,
@@ -127,12 +133,74 @@ void generate_data(DataT* out,
                  n_cols,
                  n_clusters);
     raft::random::box_muller_transform<DataT>(val1, val2, sigma1, mu1, sigma2, mu2);
-  };
-  rng.custom_distribution2<DataT, DataT, IdxT>(out, n_rows * n_cols, op, stream);
+
+    if (idx < len) out[idx] = val1;
+    idx += stride;
+    if (idx < len) out[idx] = val2;
+  }
 }
 
-}  // namespace
+template <typename DataT, typename IdxT>
+void generate_data(DataT* out,
+                   const IdxT* labels,
+                   IdxT n_rows,
+                   IdxT n_cols,
+                   IdxT n_clusters,
+                   cudaStream_t stream,
+                   bool row_major,
+                   const DataT* centers,
+                   const DataT* cluster_std,
+                   const DataT cluster_std_scalar,
+                   raft::random::RngState& rng_state)
+{
+  IdxT items   = n_rows * n_cols;
+  IdxT nBlocks = (items + 127) / 128;
+  generate_data_kernel<<<nBlocks, 128, 0, stream>>>(out,
+                                                    labels,
+                                                    n_rows,
+                                                    n_cols,
+                                                    n_clusters,
+                                                    row_major,
+                                                    centers,
+                                                    cluster_std,
+                                                    cluster_std_scalar,
+                                                    rng_state);
+}
 
+/**
+ * @brief GPU-equivalent of sklearn.datasets.make_blobs
+ *
+ * @tparam DataT output data type
+ * @tparam IdxT  indexing arithmetic type
+ *
+ * @param[out] out                generated data [on device]
+ *                                [dim = n_rows x n_cols]
+ * @param[out] labels             labels for the generated data [on device]
+ *                                [len = n_rows]
+ * @param[in]  n_rows             number of rows in the generated data
+ * @param[in]  n_cols             number of columns in the generated data
+ * @param[in]  n_clusters         number of clusters (or classes) to generate
+ * @param[in]  stream             cuda stream to schedule the work on
+ * @param[in]  row_major          whether input `centers` and output `out`
+ *                                buffers are to be stored in row or column
+ *                                major layout
+ * @param[in]  centers            centers of each of the cluster, pass a nullptr
+ *                                if you need this also to be generated randomly
+ *                                [on device] [dim = n_clusters x n_cols]
+ * @param[in]  cluster_std        standard deviation of each cluster center,
+ *                                pass a nullptr if this is to be read from the
+ *                                `cluster_std_scalar`. [on device]
+ *                                [len = n_clusters]
+ * @param[in]  cluster_std_scalar if 'cluster_std' is nullptr, then use this as
+ *                                the std-dev across all dimensions.
+ * @param[in]  shuffle            shuffle the generated dataset and labels
+ * @param[in]  center_box_min     min value of box from which to pick cluster
+ *                                centers. Useful only if 'centers' is nullptr
+ * @param[in]  center_box_max     max value of box from which to pick cluster
+ *                                centers. Useful only if 'centers' is nullptr
+ * @param[in]  seed               seed for the RNG
+ * @param[in]  type               RNG type
+ */
 template <typename DataT, typename IdxT>
 void make_blobs_caller(DataT* out,
                        IdxT* labels,
@@ -140,15 +208,15 @@ void make_blobs_caller(DataT* out,
                        IdxT n_cols,
                        IdxT n_clusters,
                        cudaStream_t stream,
-                       bool row_major                   = true,
-                       const DataT* centers             = nullptr,
-                       const DataT* cluster_std         = nullptr,
-                       const DataT cluster_std_scalar   = (DataT)1.0,
-                       bool shuffle                     = true,
-                       DataT center_box_min             = (DataT)-10.0,
-                       DataT center_box_max             = (DataT)10.0,
-                       uint64_t seed                    = 0ULL,
-                       raft::random::GeneratorType type = raft::random::GenPhilox)
+                       bool row_major,
+                       const DataT* centers,
+                       const DataT* cluster_std,
+                       const DataT cluster_std_scalar,
+                       bool shuffle,
+                       DataT center_box_min,
+                       DataT center_box_max,
+                       uint64_t seed,
+                       raft::random::GeneratorType type)
 {
   raft::random::Rng r(seed, type);
   // use the right centers buffer for data generation
@@ -172,8 +240,9 @@ void make_blobs_caller(DataT* out,
                 _centers,
                 cluster_std,
                 cluster_std_scalar,
-                r);
+                r.state);
 }
 
 }  // end namespace detail
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace random
+}  // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
new file mode 100644
index 0000000000..3b4a8d4174
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/device_utils.cuh>
+#include <raft/linalg/contractions.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+
+template <typename DataT,
+          typename IdxT,
+          typename Policy,
+          typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy>>
+struct EpsUnexpL2SqNeighborhood : public BaseClass {
+ private:
+  typedef Policy P;
+
+  bool* adj;
+  DataT eps;
+  IdxT* vd;
+
+  char* smem;  // for final reductions
+
+  DataT acc[P::AccRowsPerTh][P::AccColsPerTh];
+
+ public:
+  DI EpsUnexpL2SqNeighborhood(bool* _adj,
+                              IdxT* _vd,
+                              const DataT* _x,
+                              const DataT* _y,
+                              IdxT _m,
+                              IdxT _n,
+                              IdxT _k,
+                              DataT _eps,
+                              char* _smem)
+    : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem)
+  {
+  }
+
+  DI void run()
+  {
+    prolog();
+    loop();
+    epilog();
+  }
+
+ private:
+  DI void prolog()
+  {
+    this->ldgXY(0);
+#pragma unroll
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < P::AccColsPerTh; ++j) {
+        acc[i][j] = BaseClass::Zero;
+      }
+    }
+    this->stsXY();
+    __syncthreads();
+    this->pageWr ^= 1;
+  }
+
+  DI void loop()
+  {
+    for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
+      this->ldgXY(kidx);
+      accumulate();  // on the previous k-block
+      this->stsXY();
+      __syncthreads();
+      this->pageWr ^= 1;
+      this->pageRd ^= 1;
+    }
+    accumulate();  // last iteration
+  }
+
+  DI void epilog()
+  {
+    IdxT startx = blockIdx.x * P::Mblk + this->accrowid;
+    IdxT starty = blockIdx.y * P::Nblk + this->acccolid;
+    auto lid    = raft::laneId();
+    IdxT sums[P::AccColsPerTh];
+#pragma unroll
+    for (int j = 0; j < P::AccColsPerTh; ++j) {
+      sums[j] = 0;
+    }
+#pragma unroll
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      auto xid = startx + i * P::AccThRows;
+#pragma unroll
+      for (int j = 0; j < P::AccColsPerTh; ++j) {
+        auto yid      = starty + j * P::AccThCols;
+        auto is_neigh = acc[i][j] <= eps;
+        ///@todo: fix uncoalesced writes using shared mem
+        if (xid < this->m && yid < this->n) {
+          adj[xid * this->n + yid] = is_neigh;
+          sums[j] += is_neigh;
+        }
+      }
+    }
+    // perform reduction of adjacency values to compute vertex degrees
+    if (vd != nullptr) { updateVertexDegree(sums); }
+  }
+
+  DI void accumulate()
+  {
+#pragma unroll
+    for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
+      this->ldsXY(ki);
+#pragma unroll
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+        for (int j = 0; j < P::AccColsPerTh; ++j) {
+#pragma unroll
+          for (int v = 0; v < P::Veclen; ++v) {
+            auto diff = this->regx[i][v] - this->regy[j][v];
+            acc[i][j] += diff * diff;
+          }
+        }
+      }
+    }
+  }
+
+  DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh])
+  {
+    __syncthreads();  // so that we can safely reuse smem
+    int gid       = threadIdx.x / P::AccThCols;
+    int lid       = threadIdx.x % P::AccThCols;
+    auto cidx     = IdxT(blockIdx.y) * P::Nblk + lid;
+    IdxT totalSum = 0;
+    // update the individual vertex degrees
+#pragma unroll
+    for (int i = 0; i < P::AccColsPerTh; ++i) {
+      sums[i]  = batchedBlockReduce<IdxT, P::AccThCols>(sums[i], smem);
+      auto cid = cidx + i * P::AccThCols;
+      if (gid == 0 && cid < this->n) {
+        atomicUpdate(cid, sums[i]);
+        totalSum += sums[i];
+      }
+      __syncthreads();  // for safe smem reuse
+    }
+    // update the total edge count
+    totalSum = raft::blockReduce<IdxT>(totalSum, smem);
+    if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); }
+  }
+
+  DI void atomicUpdate(IdxT addrId, IdxT val)
+  {
+    if (sizeof(IdxT) == 4) {
+      raft::myAtomicAdd<unsigned>((unsigned*)(vd + addrId), val);
+    } else if (sizeof(IdxT) == 8) {
+      raft::myAtomicAdd<unsigned long long>((unsigned long long*)(vd + addrId), val);
+    }
+  }
+};  // struct EpsUnexpL2SqNeighborhood
+
+template <typename DataT, typename IdxT, typename Policy>
+__global__ __launch_bounds__(Policy::Nthreads, 2)
+
+  void epsUnexpL2SqNeighKernel(
+    bool* adj, IdxT* vd, const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps)
+{
+  extern __shared__ char smem[];
+  EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem);
+  obj.run();
+}
+
+template <typename DataT, typename IdxT, int VecLen>
+void epsUnexpL2SqNeighImpl(bool* adj,
+                           IdxT* vd,
+                           const DataT* x,
+                           const DataT* y,
+                           IdxT m,
+                           IdxT n,
+                           IdxT k,
+                           DataT eps,
+                           cudaStream_t stream)
+{
+  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy Policy;
+  dim3 grid(raft::ceildiv<int>(m, Policy::Mblk), raft::ceildiv<int>(n, Policy::Nblk));
+  dim3 blk(Policy::Nthreads);
+  epsUnexpL2SqNeighKernel<DataT, IdxT, Policy>
+    <<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+/**
+ * @brief Computes epsilon neighborhood for the L2-Squared distance metric
+ *
+ * @tparam DataT   IO and math type
+ * @tparam IdxT    Index type
+ *
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = m + 1]
+ *                    `vd + m` stores the total number of edges in the adjacency
+ *                    matrix. Pass a nullptr if you don't need this info.
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
+ *                    squared as we compute L2-squared distance in this method)
+ * @param[in]  fop    device lambda to do any other custom functions
+ * @param[in]  stream cuda stream
+ */
+template <typename DataT, typename IdxT>
+void epsUnexpL2SqNeighborhood(bool* adj,
+                              IdxT* vd,
+                              const DataT* x,
+                              const DataT* y,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              DataT eps,
+                              cudaStream_t stream)
+{
+  size_t bytes = sizeof(DataT) * k;
+  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 16 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 8 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+  } else {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 1>(adj, vd, x, y, m, n, k, eps, stream);
+  }
+}
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
new file mode 100644
index 0000000000..cd9163096a
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+/**
+ * @brief Computes epsilon neighborhood for the L2-Squared distance metric
+ *
+ * @tparam DataT   IO and math type
+ * @tparam IdxT    Index type
+ *
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = m + 1]
+ *                    `vd + m` stores the total number of edges in the adjacency
+ *                    matrix. Pass a nullptr if you don't need this info.
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  m      number of rows in x
+ * @param[in]  n      number of rows in y
+ * @param[in]  k      number of columns in x and k
+ * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
+ *                    squared as we compute L2-squared distance in this method)
+ * @param[in]  stream cuda stream
+ */
+template <typename DataT, typename IdxT>
+void epsUnexpL2SqNeighborhood(bool* adj,
+                              IdxT* vd,
+                              const DataT* x,
+                              const DataT* y,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              DataT eps,
+                              cudaStream_t stream)
+{
+  detail::epsUnexpL2SqNeighborhood<DataT, IdxT>(adj, vd, x, y, m, n, k, eps, stream);
+}
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp
new file mode 100644
index 0000000000..765f07a012
--- /dev/null
+++ b/cpp/include/raft/stats/common.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+
+/** Default mapper which just returns the value of the data itself */
+template <typename DataT, typename IdxT>
+struct IdentityBinner {
+  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
+};
+
+/** Types of support histogram implementations */
+enum HistType {
+  /** shared mem atomics but with bins to be 1b int's */
+  HistTypeSmemBits1 = 1,
+  /** shared mem atomics but with bins to be 2b int's */
+  HistTypeSmemBits2 = 2,
+  /** shared mem atomics but with bins to be 4b int's */
+  HistTypeSmemBits4 = 4,
+  /** shared mem atomics but with bins to ba 1B int's */
+  HistTypeSmemBits8 = 8,
+  /** shared mem atomics but with bins to be 2B int's */
+  HistTypeSmemBits16 = 16,
+  /** use only global atomics */
+  HistTypeGmem,
+  /** uses shared mem atomics to reduce global traffic */
+  HistTypeSmem,
+  /**
+   * uses shared mem atomics with match_any intrinsic to further reduce shared
+   * memory traffic. This can only be enabled on Volta and later architectures.
+   * If one tries to enable this for older arch's, it will fall back to
+   * `HistTypeSmem`.
+   * @note This is to be used only when the input dataset leads to a lot of
+   *       repetitions in a given warp, else, this algo can be much slower than
+   *       `HistTypeSmem`!
+   */
+  HistTypeSmemMatchAny,
+  /** builds a hashmap of active bins in shared mem */
+  HistTypeSmemHash,
+  /** decide at runtime the best algo for the given inputs */
+  HistTypeAuto
+};
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp
new file mode 100644
index 0000000000..dc5bc63ee8
--- /dev/null
+++ b/cpp/include/raft/stats/cov.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/detail/cov.cuh>
+namespace raft {
+namespace stats {
+/**
+ * @brief Compute covariance of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @param covar the output covariance matrix
+ * @param data the input matrix (this will get mean-centered at the end!)
+ * @param mu mean vector of the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate sample covariance or not. In other words,
+ * whether to normalize the output using N-1 or N, for true or false,
+ * respectively
+ * @param rowMajor whether the input data is row or col major
+ * @param stable whether to run the slower-but-numerically-stable version or not
+ * @param handle cublas handle
+ * @param stream cuda stream
+ * @note if stable=true, then the input data will be mean centered after this
+ * function returns!
+ */
+template <typename Type>
+void cov(const raft::handle_t& handle,
+         Type* covar,
+         Type* data,
+         const Type* mu,
+         std::size_t D,
+         std::size_t N,
+         bool sample,
+         bool rowMajor,
+         bool stable,
+         cudaStream_t stream)
+{
+  detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
+}
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh
new file mode 100644
index 0000000000..7e3fc701a1
--- /dev/null
+++ b/cpp/include/raft/stats/detail/cov.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/gemm.hpp>
+#include <raft/stats/mean_center.hpp>
+
+namespace raft {
+namespace stats {
+namespace detail {
+/**
+ * @brief Compute covariance of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @param covar the output covariance matrix
+ * @param data the input matrix (this will get mean-centered at the end!)
+ * @param mu mean vector of the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate sample covariance or not. In other words,
+ * whether to normalize the output using N-1 or N, for true or false,
+ * respectively
+ * @param rowMajor whether the input data is row or col major
+ * @param stable whether to run the slower-but-numerically-stable version or not
+ * @param handle cublas handle
+ * @param stream cuda stream
+ * @note if stable=true, then the input data will be mean centered after this
+ * function returns!
+ */
+template <typename Type>
+void cov(const raft::handle_t& handle,
+         Type* covar,
+         Type* data,
+         const Type* mu,
+         std::size_t D,
+         std::size_t N,
+         bool sample,
+         bool rowMajor,
+         bool stable,
+         cudaStream_t stream)
+{
+  if (stable) {
+    cublasHandle_t cublas_h = handle.get_cublas_handle();
+
+    // since mean operation is assumed to be along a given column, broadcast
+    // must be along rows!
+    raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream);
+    Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N));
+    Type beta  = Type(0);
+    if (rowMajor) {
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_T,
+                                                       D,
+                                                       D,
+                                                       N,
+                                                       &alpha,
+                                                       data,
+                                                       D,
+                                                       data,
+                                                       D,
+                                                       &beta,
+                                                       covar,
+                                                       D,
+                                                       stream));
+    } else {
+      raft::linalg::gemm(
+        handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+    }
+  } else {
+    ///@todo: implement this using cutlass + customized epilogue!
+    ASSERT(false, "cov: Implement stable=false case!");
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh
new file mode 100644
index 0000000000..65241f524f
--- /dev/null
+++ b/cpp/include/raft/stats/detail/histogram.cuh
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/common/seive.hpp>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/stats/common.hpp>
+#include <raft/vectorized.cuh>
+#include <stdint.h>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+static const int ThreadsPerBlock = 256;
+
+template <typename IdxT, int VecLen>
+dim3 computeGridDim(IdxT nrows, IdxT ncols, const void* kernel)
+{
+  int occupancy;
+  RAFT_CUDA_TRY(
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, ThreadsPerBlock, 0));
+  const auto maxBlks = occupancy * raft::getMultiProcessorCount();
+  int nblksx         = raft::ceildiv<int>(VecLen ? nrows / VecLen : nrows, ThreadsPerBlock);
+  // for cases when there aren't a lot of blocks for computing one histogram
+  nblksx = std::min(nblksx, maxBlks);
+  return dim3(nblksx, ncols);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, typename CoreOp>
+DI void histCoreOp(const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner, CoreOp op, IdxT col)
+{
+  IdxT offset = col * nrows;
+  auto bdim   = IdxT(blockDim.x);
+  IdxT tid    = threadIdx.x + bdim * blockIdx.x;
+  tid *= VecLen;
+  IdxT stride = bdim * gridDim.x * VecLen;
+  int nCeil   = raft::alignTo<int>(nrows, stride);
+  typedef raft::TxN_t<DataT, VecLen> VecType;
+  VecType a;
+  for (auto i = tid; i < nCeil; i += stride) {
+    if (i < nrows) { a.load(data, offset + i); }
+#pragma unroll
+    for (int j = 0; j < VecLen; ++j) {
+      int binId = binner(a.val.data[j], i + j, col);
+      op(binId, i + j, col);
+    }
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+__global__ void gmemHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+    auto binOffset = col * nbins;
+#if __CUDA_ARCH__ < 700
+    raft::myAtomicAdd(bins + binOffset + binId, 1);
+#else
+    auto amask  = __activemask();
+    auto mask   = __match_any_sync(amask, binId);
+    auto leader = __ffs(mask) - 1;
+    if (raft::laneId() == leader) { raft::myAtomicAdd(bins + binOffset + binId, __popc(mask)); }
+#endif  // __CUDA_ARCH__
+  };
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, blockIdx.y);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void gmemHist(int* bins,
+              IdxT nbins,
+              const DataT* data,
+              IdxT nrows,
+              IdxT ncols,
+              BinnerOp binner,
+              cudaStream_t stream)
+{
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)gmemHistKernel<DataT, BinnerOp, IdxT, VecLen>);
+  gmemHistKernel<DataT, BinnerOp, IdxT, VecLen>
+    <<<blks, ThreadsPerBlock, 0, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, bool UseMatchAny>
+__global__ void smemHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  extern __shared__ unsigned sbins[];
+  for (auto i = threadIdx.x; i < nbins; i += blockDim.x) {
+    sbins[i] = 0;
+  }
+  __syncthreads();
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+#if __CUDA_ARCH__ < 700
+    raft::myAtomicAdd<unsigned int>(sbins + binId, 1);
+#else
+    if (UseMatchAny) {
+      auto amask  = __activemask();
+      auto mask   = __match_any_sync(amask, binId);
+      auto leader = __ffs(mask) - 1;
+      if (raft::laneId() == leader) {
+        raft::myAtomicAdd<unsigned int>(sbins + binId, __popc(mask));
+      }
+    } else {
+      raft::myAtomicAdd<unsigned int>(sbins + binId, 1);
+    }
+#endif  // __CUDA_ARCH__
+  };
+  IdxT col = blockIdx.y;
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  auto binOffset = col * nbins;
+  for (auto i = threadIdx.x; i < nbins; i += blockDim.x) {
+    auto val = sbins[i];
+    if (val > 0) { raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binOffset + i, val); }
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, bool UseMatchAny>
+void smemHist(int* bins,
+              IdxT nbins,
+              const DataT* data,
+              IdxT nrows,
+              IdxT ncols,
+              BinnerOp binner,
+              cudaStream_t stream)
+{
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)smemHistKernel<DataT, BinnerOp, IdxT, VecLen, UseMatchAny>);
+  size_t smemSize = nbins * sizeof(unsigned);
+  smemHistKernel<DataT, BinnerOp, IdxT, VecLen, UseMatchAny>
+    <<<blks, ThreadsPerBlock, smemSize, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+template <unsigned _BIN_BITS>
+struct BitsInfo {
+  static unsigned const BIN_BITS  = _BIN_BITS;
+  static unsigned const WORD_BITS = sizeof(unsigned) * 8;
+  static unsigned const WORD_BINS = WORD_BITS / BIN_BITS;
+  static unsigned const BIN_MASK  = (1 << BIN_BITS) - 1;
+};
+
+template <unsigned BIN_BITS>
+DI void incrementBin(unsigned* sbins, int* bins, int nbins, int binId)
+{
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto iword    = binId / Bits::WORD_BINS;
+  auto ibin     = binId % Bits::WORD_BINS;
+  auto sh       = ibin * Bits::BIN_BITS;
+  auto old_word = atomicAdd(sbins + iword, unsigned(1 << sh));
+  auto new_word = old_word + unsigned(1 << sh);
+  if ((new_word >> sh & Bits::BIN_MASK) != 0) return;
+  // overflow
+  raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binId, Bits::BIN_MASK + 1);
+  for (int dbin = 1; ibin + dbin < Bits::WORD_BINS && binId + dbin < nbins; ++dbin) {
+    auto sh1 = (ibin + dbin) * Bits::BIN_BITS;
+    if ((new_word >> sh1 & Bits::BIN_MASK) == 0) {
+      // overflow
+      raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binId + dbin, Bits::BIN_MASK);
+    } else {
+      // correction
+      raft::myAtomicAdd(bins + binId + dbin, -1);
+      break;
+    }
+  }
+}
+
+template <>
+DI void incrementBin<1>(unsigned* sbins, int* bins, int nbins, int binId)
+{
+  typedef BitsInfo<1> Bits;
+  auto iword    = binId / Bits::WORD_BITS;
+  auto sh       = binId % Bits::WORD_BITS;
+  auto old_word = atomicXor(sbins + iword, unsigned(1 << sh));
+  if ((old_word >> sh & 1) != 0) raft::myAtomicAdd(bins + binId, 2);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int BIN_BITS, int VecLen>
+__global__ void smemBitsHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  extern __shared__ unsigned sbins[];
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto nwords = raft::ceildiv<int>(nbins, Bits::WORD_BINS);
+  for (auto j = threadIdx.x; j < nwords; j += blockDim.x) {
+    sbins[j] = 0;
+  }
+  __syncthreads();
+  IdxT col       = blockIdx.y;
+  IdxT binOffset = col * nbins;
+  auto op        = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+    incrementBin<Bits::BIN_BITS>(sbins, bins + binOffset, (int)nbins, binId);
+  };
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  for (auto j = threadIdx.x; j < (int)nbins; j += blockDim.x) {
+    auto shift = j % Bits::WORD_BINS * Bits::BIN_BITS;
+    int count  = sbins[j / Bits::WORD_BINS] >> shift & Bits::BIN_MASK;
+    if (count > 0) raft::myAtomicAdd(bins + binOffset + j, count);
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int BIN_BITS, int VecLen>
+void smemBitsHist(int* bins,
+                  IdxT nbins,
+                  const DataT* data,
+                  IdxT nrows,
+                  IdxT ncols,
+                  BinnerOp binner,
+                  cudaStream_t stream)
+{
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)smemBitsHistKernel<DataT, BinnerOp, IdxT, Bits::BIN_BITS, VecLen>);
+  size_t smemSize = raft::ceildiv<size_t>(nbins, Bits::WORD_BITS / Bits::BIN_BITS) * sizeof(int);
+  smemBitsHistKernel<DataT, BinnerOp, IdxT, Bits::BIN_BITS, VecLen>
+    <<<blks, ThreadsPerBlock, smemSize, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+#define INVALID_KEY -1
+
+DI void clearHashTable(int2* ht, int hashSize)
+{
+  for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) {
+    ht[i] = {INVALID_KEY, 0};
+  }
+}
+
+DI int findEntry(int2* ht, int hashSize, int binId, int threshold)
+{
+  int idx = binId % hashSize;
+  int t;
+  int count = 0;
+  while ((t = atomicCAS(&(ht[idx].x), INVALID_KEY, binId)) != INVALID_KEY && t != binId) {
+    ++count;
+    if (count >= threshold) {
+      idx = INVALID_KEY;
+      break;
+    }
+    ++idx;
+    if (idx >= hashSize) { idx = 0; }
+  }
+  return idx;
+}
+
+DI void flushHashTable(int2* ht, int hashSize, int* bins, int nbins, int col)
+{
+  int binOffset = col * nbins;
+  for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) {
+    if (ht[i].x != INVALID_KEY && ht[i].y > 0) {
+      raft::myAtomicAdd(bins + binOffset + ht[i].x, ht[i].y);
+    }
+    ht[i] = {INVALID_KEY, 0};
+  }
+}
+
+#undef INVALID_KEY
+
+///@todo: honor VecLen template param
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+__global__ void smemHashHistKernel(int* bins,
+                                   const DataT* data,
+                                   IdxT nrows,
+                                   IdxT nbins,
+                                   BinnerOp binner,
+                                   int hashSize,
+                                   int threshold)
+{
+  extern __shared__ int2 ht[];
+  int* needFlush = (int*)&(ht[hashSize]);
+  if (threadIdx.x == 0) { needFlush[0] = 0; }
+  clearHashTable(ht, hashSize);
+  __syncthreads();
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    bool iNeedFlush = false;
+    if (row < nrows) {
+      int hidx = findEntry(ht, hashSize, binId, threshold);
+      if (hidx >= 0) {
+        raft::myAtomicAdd(&(ht[hidx].y), 1);
+      } else {
+        needFlush[0] = 1;
+        iNeedFlush   = true;
+      }
+    }
+    __syncthreads();
+    if (needFlush[0]) {
+      flushHashTable(ht, hashSize, bins, nbins, col);
+      __syncthreads();
+      if (threadIdx.x == 0) { needFlush[0] = 0; }
+      __syncthreads();
+    }
+    if (iNeedFlush) {
+      int hidx = findEntry(ht, hashSize, binId, threshold);
+      // all threads are bound to get one valid entry as all threads in this
+      // block will make forward progress due to the __syncthreads call in the
+      // subsequent iteration
+      raft::myAtomicAdd(&(ht[hidx].y), 1);
+    }
+  };
+  IdxT col = blockIdx.y;
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  flushHashTable(ht, hashSize, bins, nbins, col);
+}
+
+inline int computeHashTableSize()
+{
+  // we shouldn't have this much of shared memory available anytime soon!
+  static const unsigned maxBinsEverPossible = 256 * 1024;
+  static raft::common::Seive primes(maxBinsEverPossible);
+  unsigned smem = raft::getSharedMemPerBlock();
+  // divide-by-2 because hash table entry stores 2 elements: idx and count
+  auto binsPossible = smem / sizeof(unsigned) / 2;
+  for (; binsPossible > 1; --binsPossible) {
+    if (primes.isPrime(binsPossible)) return (int)binsPossible;
+  }
+  return 1;  // should not happen!
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void smemHashHist(int* bins,
+                  IdxT nbins,
+                  const DataT* data,
+                  IdxT nrows,
+                  IdxT ncols,
+                  BinnerOp binner,
+                  cudaStream_t stream)
+{
+  static const int flushThreshold = 10;
+  auto blks                       = computeGridDim<IdxT, 1>(
+    nrows, ncols, (const void*)smemHashHistKernel<DataT, BinnerOp, IdxT, 1>);
+  int hashSize    = computeHashTableSize();
+  size_t smemSize = hashSize * sizeof(int2) + sizeof(int);
+  smemHashHistKernel<DataT, BinnerOp, IdxT, 1><<<blks, ThreadsPerBlock, smemSize, stream>>>(
+    bins, data, nrows, nbins, binner, hashSize, flushThreshold);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void histogramVecLen(HistType type,
+                     int* bins,
+                     IdxT nbins,
+                     const DataT* data,
+                     IdxT nrows,
+                     IdxT ncols,
+                     cudaStream_t stream,
+                     BinnerOp binner)
+{
+  RAFT_CUDA_TRY(cudaMemsetAsync(bins, 0, ncols * nbins * sizeof(int), stream));
+  switch (type) {
+    case HistTypeGmem:
+      gmemHist<DataT, BinnerOp, IdxT, VecLen>(bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmem:
+      smemHist<DataT, BinnerOp, IdxT, VecLen, false>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemMatchAny:
+      smemHist<DataT, BinnerOp, IdxT, VecLen, true>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits16:
+      smemBitsHist<DataT, BinnerOp, IdxT, 16, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits8:
+      smemBitsHist<DataT, BinnerOp, IdxT, 8, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits4:
+      smemBitsHist<DataT, BinnerOp, IdxT, 4, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits2:
+      smemBitsHist<DataT, BinnerOp, IdxT, 2, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits1:
+      smemBitsHist<DataT, BinnerOp, IdxT, 1, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemHash:
+      smemHashHist<DataT, BinnerOp, IdxT, VecLen>(bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    default: ASSERT(false, "histogram: Invalid type passed '%d'!", type);
+  };
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT>
+void histogramImpl(HistType type,
+                   int* bins,
+                   IdxT nbins,
+                   const DataT* data,
+                   IdxT nrows,
+                   IdxT ncols,
+                   cudaStream_t stream,
+                   BinnerOp binner)
+{
+  size_t bytes = nrows * sizeof(DataT);
+  if (nrows <= 0) return;
+  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 16 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 8 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (4 % sizeof(DataT) == 0 && bytes % 4 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 4 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (2 % sizeof(DataT) == 0 && bytes % 2 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 2 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else {
+    histogramVecLen<DataT, BinnerOp, IdxT, 1>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  }
+}
+
+template <typename IdxT>
+HistType selectBestHistAlgo(IdxT nbins)
+{
+  size_t smem         = raft::getSharedMemPerBlock();
+  size_t requiredSize = nbins * sizeof(unsigned);
+  if (requiredSize <= smem) { return HistTypeSmem; }
+  for (int bits = 16; bits >= 1; bits >>= 1) {
+    auto nBytesForBins = raft::ceildiv<size_t>(bits * nbins, 8);
+    requiredSize       = raft::alignTo<size_t>(nBytesForBins, sizeof(unsigned));
+    if (requiredSize <= smem) { return static_cast<HistType>(bits); }
+  }
+  return HistTypeGmem;
+}
+
+/**
+ * @brief Perform histogram on the input data. It chooses the right load size
+ * based on the input data vector length. It also supports large-bin cases
+ * using a specialized smem-based hashing technique.
+ * @tparam DataT input data type
+ * @tparam IdxT data type used to compute indices
+ * @tparam BinnerOp takes the input data and computes its bin index
+ * @param type histogram implementation type to choose
+ * @param bins the output bins (length = ncols * nbins)
+ * @param nbins number of bins
+ * @param data input data (length = ncols * nrows)
+ * @param nrows data array length in each column (or batch)
+ * @param ncols number of columsn (or batch size)
+ * @param stream cuda stream
+ * @param binner the operation that computes the bin index of the input data
+ *
+ * @note signature of BinnerOp is `int func(DataT, IdxT);`
+ */
+template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
+void histogram(HistType type,
+               int* bins,
+               IdxT nbins,
+               const DataT* data,
+               IdxT nrows,
+               IdxT ncols,
+               cudaStream_t stream,
+               BinnerOp binner = IdentityBinner<DataT, IdxT>())
+{
+  HistType computedType = type;
+  if (type == HistTypeAuto) { computedType = selectBestHistAlgo(nbins); }
+  histogramImpl<DataT, BinnerOp, IdxT>(
+    computedType, bins, nbins, data, nrows, ncols, stream, binner);
+}
+
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh
new file mode 100644
index 0000000000..2a4a9bff93
--- /dev/null
+++ b/cpp/include/raft/stats/detail/minmax.cuh
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+#include <limits>
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+// TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it
+template <class To, class From>
+constexpr To bit_cast(const From& from) noexcept
+{
+  To to{};
+  static_assert(sizeof(To) == sizeof(From));
+  memcpy(&to, &from, sizeof(To));
+  return to;
+}
+
+template <typename T>
+struct encode_traits {
+};
+
+template <>
+struct encode_traits<float> {
+  using E = int;
+};
+
+template <>
+struct encode_traits<double> {
+  using E = long long;
+};
+
+HDI int encode(float val)
+{
+  int i = detail::bit_cast<int>(val);
+  return i >= 0 ? i : (1 << 31) | ~i;
+}
+
+HDI long long encode(double val)
+{
+  std::int64_t i = detail::bit_cast<std::int64_t>(val);
+  return i >= 0 ? i : (1ULL << 63) | ~i;
+}
+
+HDI float decode(int val)
+{
+  if (val < 0) val = (1 << 31) | ~val;
+  return detail::bit_cast<float>(val);
+}
+
+HDI double decode(long long val)
+{
+  if (val < 0) val = (1ULL << 63) | ~val;
+  return detail::bit_cast<double>(val);
+}
+
+template <typename T, typename E>
+DI T atomicMaxBits(T* address, T val)
+{
+  E old = atomicMax((E*)address, encode(val));
+  return decode(old);
+}
+
+template <typename T, typename E>
+DI T atomicMinBits(T* address, T val)
+{
+  E old = atomicMin((E*)address, encode(val));
+  return decode(old);
+}
+
+template <typename T, typename E>
+__global__ void decodeKernel(T* globalmin, T* globalmax, int ncols)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < ncols) {
+    globalmin[tid] = decode(*(E*)&globalmin[tid]);
+    globalmax[tid] = decode(*(E*)&globalmax[tid]);
+  }
+}
+
+///@todo: implement a proper "fill" kernel
+template <typename T, typename E>
+__global__ void minmaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid >= ncols) return;
+  *(E*)&globalmin[tid] = encode(init_val);
+  *(E*)&globalmax[tid] = encode(-init_val);
+}
+
+template <typename T, typename E>
+__global__ void minmaxKernel(const T* data,
+                             const unsigned int* rowids,
+                             const unsigned int* colids,
+                             int nrows,
+                             int ncols,
+                             int row_stride,
+                             T* g_min,
+                             T* g_max,
+                             T* sampledcols,
+                             T init_min_val,
+                             int batch_ncols,
+                             int num_batches)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ char shmem[];
+  T* s_min = (T*)shmem;
+  T* s_max = (T*)(shmem + sizeof(T) * batch_ncols);
+
+  int last_batch_ncols = ncols % batch_ncols;
+  if (last_batch_ncols == 0) { last_batch_ncols = batch_ncols; }
+  int orig_batch_ncols = batch_ncols;
+
+  for (int batch_id = 0; batch_id < num_batches; batch_id++) {
+    if (batch_id == num_batches - 1) { batch_ncols = last_batch_ncols; }
+
+    for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) {
+      *(E*)&s_min[i] = encode(init_min_val);
+      *(E*)&s_max[i] = encode(-init_min_val);
+    }
+    __syncthreads();
+
+    for (int i = tid; i < nrows * batch_ncols; i += blockDim.x * gridDim.x) {
+      int col = (batch_id * orig_batch_ncols) + (i / nrows);
+      int row = i % nrows;
+      if (colids != nullptr) { col = colids[col]; }
+      if (rowids != nullptr) { row = rowids[row]; }
+      int index = row + col * row_stride;
+      T coldata = data[index];
+      if (!isnan(coldata)) {
+        // Min max values are saved in shared memory and global memory as per the shuffled colids.
+        atomicMinBits<T, E>(&s_min[(int)(i / nrows)], coldata);
+        atomicMaxBits<T, E>(&s_max[(int)(i / nrows)], coldata);
+      }
+      if (sampledcols != nullptr) { sampledcols[batch_id * orig_batch_ncols + i] = coldata; }
+    }
+    __syncthreads();
+
+    // finally, perform global mem atomics
+    for (int j = threadIdx.x; j < batch_ncols; j += blockDim.x) {
+      atomicMinBits<T, E>(&g_min[batch_id * orig_batch_ncols + j], decode(*(E*)&s_min[j]));
+      atomicMaxBits<T, E>(&g_max[batch_id * orig_batch_ncols + j], decode(*(E*)&s_max[j]));
+    }
+    __syncthreads();
+  }
+}
+
+/**
+ * @brief Computes min/max across every column of the input matrix, as well as
+ * optionally allow to subsample based on the given row/col ID mapping vectors
+ *
+ * @tparam T the data type
+ * @tparam TPB number of threads per block
+ * @param data input data
+ * @param rowids actual row ID mappings. It is of length nrows. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param colids actual col ID mappings. It is of length ncols. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param nrows number of rows of data to be worked upon. The actual rows of the
+ * input "data" can be bigger than this!
+ * @param ncols number of cols of data to be worked upon. The actual cols of the
+ * input "data" can be bigger than this!
+ * @param row_stride stride (in number of elements) between 2 adjacent columns
+ * @param globalmin final col-wise global minimum (size = ncols)
+ * @param globalmax final col-wise global maximum (size = ncols)
+ * @param sampledcols output sampled data. Pass nullptr if you don't need this
+ * @param stream cuda stream
+ * @note This method makes the following assumptions:
+ * 1. input and output matrices are assumed to be col-major
+ * 2. ncols is small enough to fit the whole of min/max values across all cols
+ *    in shared memory
+ */
+template <typename T, int TPB = 512>
+void minmax(const T* data,
+            const unsigned* rowids,
+            const unsigned* colids,
+            int nrows,
+            int ncols,
+            int row_stride,
+            T* globalmin,
+            T* globalmax,
+            T* sampledcols,
+            cudaStream_t stream)
+{
+  using E    = typename encode_traits<T>::E;
+  int nblks  = raft::ceildiv(ncols, TPB);
+  T init_val = std::numeric_limits<T>::max();
+  minmaxInitKernel<T, E><<<nblks, TPB, 0, stream>>>(ncols, globalmin, globalmax, init_val);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  nblks           = raft::ceildiv(nrows * ncols, TPB);
+  nblks           = min(nblks, 65536);
+  size_t smemSize = sizeof(T) * 2 * ncols;
+
+  // Compute the batch_ncols, in [1, ncols] range, that meet the available
+  // shared memory constraints.
+  auto smemPerBlk = raft::getSharedMemPerBlock();
+  int batch_ncols = min(ncols, (int)(smemPerBlk / (sizeof(T) * 2)));
+  int num_batches = raft::ceildiv(ncols, batch_ncols);
+  smemSize        = sizeof(T) * 2 * batch_ncols;
+
+  minmaxKernel<T, E><<<nblks, TPB, smemSize, stream>>>(data,
+                                                       rowids,
+                                                       colids,
+                                                       nrows,
+                                                       ncols,
+                                                       row_stride,
+                                                       globalmin,
+                                                       globalmax,
+                                                       sampledcols,
+                                                       init_val,
+                                                       batch_ncols,
+                                                       num_batches);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  decodeKernel<T, E><<<nblks, TPB, 0, stream>>>(globalmin, globalmax, ncols);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
new file mode 100644
index 0000000000..ca7fc136d3
--- /dev/null
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/coalesced_reduction.hpp>
+#include <raft/linalg/strided_reduction.hpp>
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+/**
+ * @brief Compute the row-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be row-major)
+ * @param weights per-column means
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void rowWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  // sum the weights & copy back to CPU
+  Type WS = 0;
+  raft::linalg::coalescedReduction(mu, weights, D, 1, (Type)0, stream, false);
+  raft::update_host(&WS, mu, 1, stream);
+
+  raft::linalg::coalescedReduction(
+    mu,
+    data,
+    D,
+    N,
+    (Type)0,
+    stream,
+    false,
+    [weights] __device__(Type v, int i) { return v * weights[i]; },
+    [] __device__(Type a, Type b) { return a + b; },
+    [WS] __device__(Type v) { return v / WS; });
+}
+
+/**
+ * @brief Compute the column-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be column-major)
+ * @param weights per-column means
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void colWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  // sum the weights & copy back to CPU
+  Type WS = 0;
+  raft::linalg::stridedReduction(mu, weights, 1, N, (Type)0, stream, false);
+  raft::update_host(&WS, mu, 1, stream);
+
+  raft::linalg::stridedReduction(
+    mu,
+    data,
+    D,
+    N,
+    (Type)0,
+    stream,
+    false,
+    [weights] __device__(Type v, int i) { return v * weights[i]; },
+    [] __device__(Type a, Type b) { return a + b; },
+    [WS] __device__(Type v) { return v / WS; });
+}
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp
new file mode 100644
index 0000000000..d4d3b449f7
--- /dev/null
+++ b/cpp/include/raft/stats/histogram.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/common.hpp>
+#include <raft/stats/detail/histogram.cuh>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+
+/**
+ * @brief Perform histogram on the input data. It chooses the right load size
+ * based on the input data vector length. It also supports large-bin cases
+ * using a specialized smem-based hashing technique.
+ * @tparam DataT input data type
+ * @tparam IdxT data type used to compute indices
+ * @tparam BinnerOp takes the input data and computes its bin index
+ * @param type histogram implementation type to choose
+ * @param bins the output bins (length = ncols * nbins)
+ * @param nbins number of bins
+ * @param data input data (length = ncols * nrows)
+ * @param nrows data array length in each column (or batch)
+ * @param ncols number of columsn (or batch size)
+ * @param stream cuda stream
+ * @param binner the operation that computes the bin index of the input data
+ *
+ * @note signature of BinnerOp is `int func(DataT, IdxT);`
+ */
+template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
+void histogram(HistType type,
+               int* bins,
+               IdxT nbins,
+               const DataT* data,
+               IdxT nrows,
+               IdxT ncols,
+               cudaStream_t stream,
+               BinnerOp binner = IdentityBinner<DataT, IdxT>())
+{
+  detail::histogram<DataT, IdxT, BinnerOp>(type, bins, nbins, data, nrows, ncols, stream, binner);
+}
+
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp
new file mode 100644
index 0000000000..966287bb41
--- /dev/null
+++ b/cpp/include/raft/stats/minmax.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/stats/detail/minmax.cuh>
+
+#include <limits>
+
+namespace raft {
+namespace stats {
+
+/**
+ * @brief Computes min/max across every column of the input matrix, as well as
+ * optionally allow to subsample based on the given row/col ID mapping vectors
+ *
+ * @tparam T the data type
+ * @tparam TPB number of threads per block
+ * @param data input data
+ * @param rowids actual row ID mappings. It is of length nrows. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param colids actual col ID mappings. It is of length ncols. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param nrows number of rows of data to be worked upon. The actual rows of the
+ * input "data" can be bigger than this!
+ * @param ncols number of cols of data to be worked upon. The actual cols of the
+ * input "data" can be bigger than this!
+ * @param row_stride stride (in number of elements) between 2 adjacent columns
+ * @param globalmin final col-wise global minimum (size = ncols)
+ * @param globalmax final col-wise global maximum (size = ncols)
+ * @param sampledcols output sampled data. Pass nullptr if you don't need this
+ * @param stream cuda stream
+ * @note This method makes the following assumptions:
+ * 1. input and output matrices are assumed to be col-major
+ * 2. ncols is small enough to fit the whole of min/max values across all cols
+ *    in shared memory
+ */
+template <typename T, int TPB = 512>
+void minmax(const T* data,
+            const unsigned* rowids,
+            const unsigned* colids,
+            int nrows,
+            int ncols,
+            int row_stride,
+            T* globalmin,
+            T* globalmax,
+            T* sampledcols,
+            cudaStream_t stream)
+{
+  detail::minmax<T, TPB>(
+    data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream);
+}
+
+};  // namespace stats
+};  // namespace raft
diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp
new file mode 100644
index 0000000000..ad90142a08
--- /dev/null
+++ b/cpp/include/raft/stats/weighted_mean.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/detail/weighted_mean.cuh>
+
+namespace raft {
+namespace stats {
+
+/**
+ * @brief Compute the row-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be row-major)
+ * @param weights per-column means
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void rowWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  detail::rowWeightedMean(mu, data, weights, D, N, stream);
+}
+
+/**
+ * @brief Compute the column-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be column-major)
+ * @param weights per-column means
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void colWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  detail::colWeightedMean(mu, data, weights, D, N, stream);
+}
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 654ab73f84..430b69341c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -16,6 +16,7 @@
 
 # keep the files in alphabetical order!
 add_executable(test_raft
+    test/common/seive.cu
     test/cudart_utils.cpp
     test/cluster_solvers.cu
     test/distance/dist_adj.cu
@@ -106,14 +107,19 @@ add_executable(test_raft
     test/spatial/fused_l2_knn.cu
     test/spatial/haversine.cu
     test/spatial/ball_cover.cu
+    test/spatial/epsilon_neighborhood.cu
     test/spatial/faiss_mr.cu
     test/spatial/selection.cu
     test/spectral_matrix.cu
+    test/stats/cov.cu
+    test/stats/histogram.cu
     test/stats/mean.cu
     test/stats/meanvar.cu
     test/stats/mean_center.cu
+    test/stats/minmax.cu
     test/stats/stddev.cu
     test/stats/sum.cu
+    test/stats/weighted_mean.cu
     test/test.cpp
 )
 
diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu
new file mode 100644
index 0000000000..8044dbb532
--- /dev/null
+++ b/cpp/test/common/seive.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/common/seive.hpp>
+
+namespace raft {
+namespace common {
+TEST(Seive, Test)
+{
+  Seive s1(32);
+  ASSERT_TRUE(s1.isPrime(17));
+  ASSERT_FALSE(s1.isPrime(28));
+
+  Seive s2(1024 * 1024);
+  ASSERT_TRUE(s2.isPrime(107));
+  ASSERT_FALSE(s2.isPrime(111));
+  ASSERT_TRUE(s2.isPrime(6047));
+}
+
+}  // end namespace common
+}  // end namespace raft
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index b8e44580b5..da38464bf7 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -31,6 +31,7 @@ struct RsvdInputs {
   T tolerance;
   int n_row;
   int n_col;
+  float redundancy;
   T PC_perc;
   T UpS_perc;
   int k;
@@ -66,7 +67,7 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
     params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
     // rSVD seems to be very sensitive to the random number sequence as well!
-    raft::random::Rng r(params.seed, raft::random::GenTaps);
+    raft::random::Rng r(params.seed, raft::random::GenPC);
     int m = params.n_row, n = params.n_col;
     T eig_svd_tol  = 1.e-7;
     int max_sweeps = 100;
@@ -91,8 +92,19 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
       raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream);
       raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream);
 
-    } else {  // Other normal tests
-      r.normal(A.data(), m * n, mu, sigma, stream);
+    } else {                                 // Other normal tests
+      int n_informative   = int(0.25f * n);  // Informative cols
+      int len_informative = m * n_informative;
+
+      int n_redundant   = n - n_informative;  // Redundant cols
+      int len_redundant = m * n_redundant;
+
+      r.normal(A.data(), len_informative, mu, sigma, stream);
+      CUDA_CHECK(cudaMemcpyAsync(A.data() + len_informative,
+                                 A.data(),
+                                 len_redundant * sizeof(T),
+                                 cudaMemcpyDeviceToDevice,
+                                 stream));
     }
     std::vector<T> A_backup_cpu(m *
                                 n);  // Backup A matrix as svdJacobi will destroy the content of A
@@ -157,59 +169,65 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
 const std::vector<RsvdInputs<float>> inputs_fx = {
   // Test with ratios
-  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Square + BBT
-  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Tall + BBT
-  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Square + non-BBT
-  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Tall + non-BBT
-  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
-  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
-  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
-  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-  ,                                                         // Test with fixed ranks
-  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
-  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
-  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
-  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
-  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
-  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
-  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
-  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Square + non-BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Tall + non-BBT
+
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                              // Test with fixed ranks
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Square + non-BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Tall + non-BBT
+
+  {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+
+  {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
 };
 
 const std::vector<RsvdInputs<double>> inputs_dx = {
   // Test with ratios
-  {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
-  {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
-  {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
-  {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
-  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
-  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
-  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
-  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-  ,                                                      // Test with fixed ranks
-  {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
-  {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
-  {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
-  {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
-  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
-  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
-  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
-  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                             // Test with fixed ranks
+  {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
+  {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
+  {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
+  {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
+  {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
+  {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
 };
 
 const std::vector<RsvdInputs<float>> sanity_inputs_fx = {
-  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
+  {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
 
 const std::vector<RsvdInputs<double>> sanity_inputs_dx = {
-  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}};
+  {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, false, 4321ULL}};
 
 typedef RsvdTest<float> RsvdSanityCheckValF;
 TEST_P(RsvdSanityCheckValF, Result)
diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu
index 8c7e440d0e..caad627d49 100644
--- a/cpp/test/random/make_blobs.cu
+++ b/cpp/test/random/make_blobs.cu
@@ -21,7 +21,8 @@
 #include <raft/cudart_utils.h>
 #include <raft/random/make_blobs.hpp>
 
-namespace raft::random {
+namespace raft {
+namespace random {
 
 template <typename T>
 __global__ void meanKernel(T* out,
@@ -136,8 +137,8 @@ class MakeBlobsTest : public ::testing::TestWithParam<MakeBlobsInputs<T>> {
   {
     int len      = params.n_clusters * params.cols;
     auto compare = raft::CompareApprox<T>(num_sigma * params.tolerance);
-    ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare, stream));
-    ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare, stream));
+    ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare));
+    ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare));
   }
 
  protected:
@@ -153,53 +154,37 @@ typedef MakeBlobsTest<float> MakeBlobsTestF;
 const std::vector<MakeBlobsInputs<float>> inputsf_t = {
   {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-
+  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
+
+  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPC, 1234ULL},
 };
 
 TEST_P(MakeBlobsTestF, Result) { check(); }
@@ -209,55 +194,40 @@ typedef MakeBlobsTest<double> MakeBlobsTestD;
 const std::vector<MakeBlobsInputs<double>> inputsd_t = {
   {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPC, 1234ULL},
 
   {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPC, 1234ULL},
 };
 TEST_P(MakeBlobsTestD, Result) { check(); }
 INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestD, ::testing::ValuesIn(inputsd_t));
 
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace random
+}  // end namespace raft
\ No newline at end of file
diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu
new file mode 100644
index 0000000000..33af5726a0
--- /dev/null
+++ b/cpp/test/spatial/epsilon_neighborhood.cu
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <raft/cudart_utils.h>
+#include <raft/random/make_blobs.hpp>
+#include <raft/spatial/knn/epsilon_neighborhood.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+template <typename T, typename IdxT>
+struct EpsInputs {
+  IdxT n_row, n_col, n_centers, n_batches;
+  T eps;
+};
+
+template <typename T, typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const EpsInputs<T, IdxT>& p)
+{
+  return os;
+}
+
+template <typename T, typename IdxT>
+class EpsNeighTest : public ::testing::TestWithParam<EpsInputs<T, IdxT>> {
+ protected:
+  EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {}
+
+  void SetUp() override
+  {
+    param = ::testing::TestWithParam<EpsInputs<T, IdxT>>::GetParam();
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    data.resize(param.n_row * param.n_col, stream);
+    labels.resize(param.n_row, stream);
+    batchSize = param.n_row / param.n_batches;
+    adj.resize(param.n_row * batchSize, stream);
+    vd.resize(batchSize + 1, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream));
+    random::make_blobs<T, IdxT>(data.data(),
+                                labels.data(),
+                                param.n_row,
+                                param.n_col,
+                                param.n_centers,
+                                stream,
+                                true,
+                                nullptr,
+                                nullptr,
+                                T(0.01),
+                                false);
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+  EpsInputs<T, IdxT> param;
+  cudaStream_t stream = 0;
+  rmm::device_uvector<T> data;
+  rmm::device_uvector<bool> adj;
+  rmm::device_uvector<IdxT> labels, vd;
+  IdxT batchSize;
+};  // class EpsNeighTest
+
+const std::vector<EpsInputs<float, int>> inputsfi = {
+  {15000, 16, 5, 1, 2.f},
+  {14000, 16, 5, 1, 2.f},
+  {15000, 17, 5, 1, 2.f},
+  {14000, 17, 5, 1, 2.f},
+  {15000, 18, 5, 1, 2.f},
+  {14000, 18, 5, 1, 2.f},
+  {15000, 32, 5, 1, 2.f},
+  {14000, 32, 5, 1, 2.f},
+  {20000, 10000, 10, 1, 2.f},
+  {20000, 10000, 10, 2, 2.f},
+};
+typedef EpsNeighTest<float, int> EpsNeighTestFI;
+TEST_P(EpsNeighTestFI, Result)
+{
+  for (int i = 0; i < param.n_batches; ++i) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream));
+    epsUnexpL2SqNeighborhood<float, int>(adj.
+
+                                         data(),
+                                         vd
+
+                                           .
+
+                                         data(),
+                                         data
+
+                                           .
+
+                                         data(),
+                                         data
+
+                                             .
+
+                                           data()
+
+                                           + (i * batchSize * param.n_col),
+                                         param.n_row,
+                                         batchSize,
+                                         param.n_col,
+                                         param.eps * param.eps,
+                                         stream);
+    ASSERT_TRUE(raft::devArrMatch(
+      param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare<int>(), stream));
+  }
+}
+INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi));
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
new file mode 100644
index 0000000000..2db64a7999
--- /dev/null
+++ b/cpp/test/stats/cov.cu
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+#include <raft/stats/cov.hpp>
+#include <raft/stats/mean.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct CovInputs {
+  T tolerance, mean, var;
+  int rows, cols;
+  bool sample, rowMajor, stable;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const CovInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename T>
+class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
+ protected:
+  CovTest()
+    : data(0, stream),
+      mean_act(0, stream),
+      cov_act(0, stream),
+      cov_cm(0, stream),
+      cov_cm_ref(0, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    raft::handle_t handle;
+    cudaStream_t stream = handle.get_stream();
+
+    params = ::testing::TestWithParam<CovInputs<T>>::GetParam();
+    params.tolerance *= 2;
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols;
+    auto len = rows * cols;
+    T var    = params.var;
+    data.resize(len, stream);
+    mean_act.resize(cols, stream);
+    cov_act.resize(cols * cols, stream);
+
+    r.normal(data.data(), len, params.mean, var, stream);
+    raft::stats::mean(
+      mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream);
+    cov(handle,
+        cov_act.data(),
+        data.data(),
+        mean_act.data(),
+        cols,
+        rows,
+        params.sample,
+        params.rowMajor,
+        params.stable,
+        stream);
+
+    T data_h[6]       = {1.0, 2.0, 5.0, 4.0, 2.0, 1.0};
+    T cov_cm_ref_h[4] = {4.3333, -2.8333, -2.8333, 2.333};
+
+    cov_cm.resize(4, stream);
+    cov_cm_ref.resize(4, stream);
+    rmm::device_uvector<T> data_cm(6, stream);
+    rmm::device_uvector<T> mean_cm(2, stream);
+
+    raft::update_device(data_cm.data(), data_h, 6, stream);
+    raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);
+
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream);
+    cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
+  }
+
+ protected:
+  CovInputs<T> params;
+  rmm::device_uvector<T> data, mean_act, cov_act, cov_cm, cov_cm_ref;
+  cublasHandle_t handle;
+  cudaStream_t stream = 0;
+};
+
+///@todo: add stable=false after it has been implemented
+const std::vector<CovInputs<float>> inputsf = {
+  {0.03f, 1.f, 2.f, 32 * 1024, 32, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 64, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 128, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 256, true, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 32, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 64, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 128, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 256, false, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 32, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 64, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 128, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 256, true, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 32, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 64, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 128, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 256, false, true, true, 1234ULL}};
+
+const std::vector<CovInputs<double>> inputsd = {
+  {0.03, 1.0, 2.0, 32 * 1024, 32, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 64, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 128, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 256, true, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 32, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 64, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 128, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 256, false, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 32, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 64, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 128, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 256, true, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 32, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 64, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 128, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 256, false, true, true, 1234ULL}};
+
+typedef CovTest<float> CovTestF;
+TEST_P(CovTestF, Result)
+{
+  ASSERT_TRUE(raft::diagonalMatch(params.var * params.var,
+                                  cov_act.data(),
+                                  params.cols,
+                                  params.cols,
+                                  raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef CovTest<double> CovTestD;
+TEST_P(CovTestD, Result)
+{
+  ASSERT_TRUE(raft::diagonalMatch(params.var * params.var,
+                                  cov_act.data(),
+                                  params.cols,
+                                  params.cols,
+                                  raft::CompareApprox<double>(params.tolerance)));
+}
+
+typedef CovTest<float> CovTestSmallF;
+TEST_P(CovTestSmallF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef CovTest<double> CovTestSmallD;
+TEST_P(CovTestSmallD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestD, ::testing::ValuesIn(inputsd));
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallD, ::testing::ValuesIn(inputsd));
+
+}  // namespace stats
+}  // namespace raft
diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu
new file mode 100644
index 0000000000..ff538fcdca
--- /dev/null
+++ b/cpp/test/stats/histogram.cu
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.hpp>
+#include <raft/stats/histogram.hpp>
+
+namespace raft {
+namespace stats {
+
+// Note: this kernel also updates the input vector to take care of OOB bins!
+__global__ void naiveHistKernel(int* bins, int nbins, int* in, int nrows)
+{
+  int tid        = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride     = blockDim.x * gridDim.x;
+  auto offset    = blockIdx.y * nrows;
+  auto binOffset = blockIdx.y * nbins;
+  for (; tid < nrows; tid += stride) {
+    int id = in[offset + tid];
+    if (id < 0)
+      id = 0;
+    else if (id >= nbins)
+      id = nbins - 1;
+    in[offset + tid] = id;
+    raft::myAtomicAdd(bins + binOffset + id, 1);
+  }
+}
+
+void naiveHist(int* bins, int nbins, int* in, int nrows, int ncols, cudaStream_t stream)
+{
+  const int TPB = 128;
+  int nblksx    = raft::ceildiv(nrows, TPB);
+  dim3 blks(nblksx, ncols);
+  naiveHistKernel<<<blks, TPB, 0, stream>>>(bins, nbins, in, nrows);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+struct HistInputs {
+  int nrows, ncols, nbins;
+  bool isNormal;
+  HistType type;
+  int start, end;
+  unsigned long long int seed;
+};
+
+class HistTest : public ::testing::TestWithParam<HistInputs> {
+ protected:
+  HistTest() : in(0, stream), bins(0, stream), ref_bins(0, stream) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<HistInputs>::GetParam();
+    raft::random::Rng r(params.seed);
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    int len = params.nrows * params.ncols;
+    in.resize(len, stream);
+    if (params.isNormal) {
+      r.normalInt(in.data(), len, params.start, params.end, stream);
+    } else {
+      r.uniformInt(in.data(), len, params.start, params.end, stream);
+    }
+    bins.resize(params.nbins * params.ncols, stream);
+    ref_bins.resize(params.nbins * params.ncols, stream);
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(ref_bins.data(), 0, sizeof(int) * params.nbins * params.ncols, stream));
+    naiveHist(ref_bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream);
+    histogram<int>(
+      params.type, bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream);
+    raft::interruptible::synchronize(stream);
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+ protected:
+  cudaStream_t stream = 0;
+  HistInputs params;
+  rmm::device_uvector<int> in, bins, ref_bins;
+};
+
+static const int oneK                = 1024;
+static const int oneM                = oneK * oneK;
+const std::vector<HistInputs> inputs = {
+  {oneM, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+};
+TEST_P(HistTest, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    ref_bins.data(), bins.data(), params.nbins * params.ncols, raft::Compare<int>()));
+}
+INSTANTIATE_TEST_CASE_P(HistTests, HistTest, ::testing::ValuesIn(inputs));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
new file mode 100644
index 0000000000..61b16b65ae
--- /dev/null
+++ b/cpp/test/stats/minmax.cu
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+#include <raft/stats/minmax.hpp>
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace raft {
+namespace stats {
+
+///@todo: need to add tests for verifying the column subsampling feature
+
+template <typename T>
+struct MinMaxInputs {
+  T tolerance;
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const MinMaxInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename T>
+__global__ void naiveMinMaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid >= ncols) return;
+  globalmin[tid] = init_val;
+  globalmax[tid] = -init_val;
+}
+
+template <typename T>
+__global__ void naiveMinMaxKernel(const T* data, int nrows, int ncols, T* globalmin, T* globalmax)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int col = tid / nrows;
+  if (col < ncols) {
+    T val = data[tid];
+    if (!isnan(val)) {
+      raft::myAtomicMin(&globalmin[col], val);
+      raft::myAtomicMax(&globalmax[col], val);
+    }
+  }
+}
+
+template <typename T>
+void naiveMinMax(
+  const T* data, int nrows, int ncols, T* globalmin, T* globalmax, cudaStream_t stream)
+{
+  const int TPB = 128;
+  int nblks     = raft::ceildiv(ncols, TPB);
+  T init_val    = std::numeric_limits<T>::max();
+  naiveMinMaxInitKernel<<<nblks, TPB, 0, stream>>>(ncols, globalmin, globalmax, init_val);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  nblks = raft::ceildiv(nrows * ncols, TPB);
+  naiveMinMaxKernel<<<nblks, TPB, 0, stream>>>(data, nrows, ncols, globalmin, globalmax);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename T>
+__global__ void nanKernel(T* data, const bool* mask, int len, T nan)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid >= len) return;
+  if (!mask[tid]) data[tid] = nan;
+}
+
+template <typename T>
+class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
+ protected:
+  MinMaxTest() : minmax_act(0, stream), minmax_ref(0, stream) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<MinMaxInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.rows * params.cols;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    rmm::device_uvector<T> data(len, stream);
+    rmm::device_uvector<bool> mask(len, stream);
+    minmax_act.resize(2 * params.cols, stream);
+    minmax_ref.resize(2 * params.cols, stream);
+
+    r.normal(data.data(), len, (T)0.0, (T)1.0, stream);
+    T nan_prob = 0.01;
+    r.bernoulli(mask.data(), len, nan_prob, stream);
+    const int TPB = 256;
+    nanKernel<<<raft::ceildiv(len, TPB), TPB, 0, stream>>>(
+      data.data(), mask.data(), len, std::numeric_limits<T>::quiet_NaN());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    naiveMinMax(data.data(),
+                params.rows,
+                params.cols,
+                minmax_ref.data(),
+                minmax_ref.data() + params.cols,
+                stream);
+    minmax<T, 512>(data.data(),
+                   nullptr,
+                   nullptr,
+                   params.rows,
+                   params.cols,
+                   params.rows,
+                   minmax_act.data(),
+                   minmax_act.data() + params.cols,
+                   nullptr,
+                   stream);
+  }
+
+ protected:
+  MinMaxInputs<T> params;
+  rmm::device_uvector<T> minmax_act;
+  rmm::device_uvector<T> minmax_ref;
+  cudaStream_t stream = 0;
+};
+
+const std::vector<MinMaxInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
+                                                  {0.00001f, 1024, 64, 1234ULL},
+                                                  {0.00001f, 1024, 128, 1234ULL},
+                                                  {0.00001f, 1024, 256, 1234ULL},
+                                                  {0.00001f, 1024, 512, 1234ULL},
+                                                  {0.00001f, 1024, 1024, 1234ULL},
+                                                  {0.00001f, 4096, 32, 1234ULL},
+                                                  {0.00001f, 4096, 64, 1234ULL},
+                                                  {0.00001f, 4096, 128, 1234ULL},
+                                                  {0.00001f, 4096, 256, 1234ULL},
+                                                  {0.00001f, 4096, 512, 1234ULL},
+                                                  {0.00001f, 4096, 1024, 1234ULL},
+                                                  {0.00001f, 8192, 32, 1234ULL},
+                                                  {0.00001f, 8192, 64, 1234ULL},
+                                                  {0.00001f, 8192, 128, 1234ULL},
+                                                  {0.00001f, 8192, 256, 1234ULL},
+                                                  {0.00001f, 8192, 512, 1234ULL},
+                                                  {0.00001f, 8192, 1024, 1234ULL},
+                                                  {0.00001f, 1024, 8192, 1234ULL}};
+
+const std::vector<MinMaxInputs<double>> inputsd = {{0.0000001, 1024, 32, 1234ULL},
+                                                   {0.0000001, 1024, 64, 1234ULL},
+                                                   {0.0000001, 1024, 128, 1234ULL},
+                                                   {0.0000001, 1024, 256, 1234ULL},
+                                                   {0.0000001, 1024, 512, 1234ULL},
+                                                   {0.0000001, 1024, 1024, 1234ULL},
+                                                   {0.0000001, 4096, 32, 1234ULL},
+                                                   {0.0000001, 4096, 64, 1234ULL},
+                                                   {0.0000001, 4096, 128, 1234ULL},
+                                                   {0.0000001, 4096, 256, 1234ULL},
+                                                   {0.0000001, 4096, 512, 1234ULL},
+                                                   {0.0000001, 4096, 1024, 1234ULL},
+                                                   {0.0000001, 8192, 32, 1234ULL},
+                                                   {0.0000001, 8192, 64, 1234ULL},
+                                                   {0.0000001, 8192, 128, 1234ULL},
+                                                   {0.0000001, 8192, 256, 1234ULL},
+                                                   {0.0000001, 8192, 512, 1234ULL},
+                                                   {0.0000001, 8192, 1024, 1234ULL},
+                                                   {0.0000001, 1024, 8192, 1234ULL}};
+
+typedef MinMaxTest<float> MinMaxTestF;
+TEST_P(MinMaxTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(),
+                                minmax_act.data(),
+                                2 * params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef MinMaxTest<double> MinMaxTestD;
+TEST_P(MinMaxTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(),
+                                minmax_act.data(),
+                                2 * params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
new file mode 100644
index 0000000000..ee58747b69
--- /dev/null
+++ b/cpp/test/stats/weighted_mean.cu
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/random/rng.hpp>
+#include <raft/stats/weighted_mean.hpp>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct WeightedMeanInputs {
+  T tolerance;
+  int M, N;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const WeightedMeanInputs<T>& I)
+{
+  return os << "{ " << I.tolerance << ", " << I.M << ", " << I.N << ", " << I.seed << "}"
+            << std::endl;
+}
+
+///// weighted row-wise mean test and support functions
+template <typename T>
+void naiveRowWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor)
+{
+  int istr = rowMajor ? 1 : M;
+  int jstr = rowMajor ? N : 1;
+
+  // sum the weights
+  T WS = 0;
+  for (int i = 0; i < N; i++)
+    WS += W[i];
+
+  for (int j = 0; j < M; j++) {
+    R[j] = (T)0;
+    for (int i = 0; i < N; i++) {
+      // R[j] += (W[i]*D[i*istr + j*jstr] - R[j])/(T)(i+1);
+      R[j] += (W[i] * D[i * istr + j * jstr]) / WS;
+    }
+  }
+}
+
+template <typename T>
+class RowWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>> {
+ protected:
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<WeightedMeanInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.M, cols = params.N, len = rows * cols;
+    cudaStream_t stream = 0;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    // device-side data
+    din.resize(len);
+    dweights.resize(cols);
+    dexp.resize(rows);
+    dact.resize(rows);
+
+    // create random matrix and weights
+    r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream);
+    r.uniform(dweights.data().get(), cols, T(-1.0), T(1.0), stream);
+
+    // host-side data
+    thrust::host_vector<T> hin      = din;
+    thrust::host_vector<T> hweights = dweights;
+    thrust::host_vector<T> hexp(rows);
+
+    // compute naive result & copy to GPU
+    naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
+    dexp = hexp;
+
+    // compute ml-prims result
+    rowWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
+
+    // adjust tolerance to account for round-off accumulation
+    params.tolerance *= params.N;
+    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {}
+
+ protected:
+  WeightedMeanInputs<T> params;
+  thrust::host_vector<T> hin, hweights;
+  thrust::device_vector<T> din, dweights, dexp, dact;
+};
+
+///// weighted column-wise mean test and support functions
+template <typename T>
+void naiveColWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor)
+{
+  int istr = rowMajor ? 1 : M;
+  int jstr = rowMajor ? N : 1;
+
+  // sum the weights
+  T WS = 0;
+  for (int j = 0; j < M; j++)
+    WS += W[j];
+
+  for (int i = 0; i < N; i++) {
+    R[i] = (T)0;
+    for (int j = 0; j < M; j++) {
+      // R[i] += (W[j]*D[i*istr + j*jstr] - R[i])/(T)(j+1);
+      R[i] += (W[j] * D[i * istr + j * jstr]) / WS;
+    }
+  }
+}
+
+template <typename T>
+class ColWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>> {
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<WeightedMeanInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.M, cols = params.N, len = rows * cols;
+
+    cudaStream_t stream = 0;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    // device-side data
+    din.resize(len);
+    dweights.resize(rows);
+    dexp.resize(cols);
+    dact.resize(cols);
+
+    // create random matrix and weights
+    r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream);
+    r.uniform(dweights.data().get(), rows, T(-1.0), T(1.0), stream);
+
+    // host-side data
+    thrust::host_vector<T> hin      = din;
+    thrust::host_vector<T> hweights = dweights;
+    thrust::host_vector<T> hexp(cols);
+
+    // compute naive result & copy to GPU
+    naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
+    dexp = hexp;
+
+    // compute ml-prims result
+    colWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
+
+    // adjust tolerance to account for round-off accumulation
+    params.tolerance *= params.M;
+    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+  }
+
+  void TearDown() override {}
+
+ protected:
+  WeightedMeanInputs<T> params;
+  thrust::host_vector<T> hin, hweights;
+  thrust::device_vector<T> din, dweights, dexp, dact;
+};
+
+////// Parameter sets and test instantiation
+static const float tolF  = 128 * std::numeric_limits<float>::epsilon();
+static const double tolD = 256 * std::numeric_limits<double>::epsilon();
+
+const std::vector<WeightedMeanInputs<float>> inputsf = {{tolF, 4, 4, 1234},
+                                                        {tolF, 1024, 32, 1234},
+                                                        {tolF, 1024, 64, 1234},
+                                                        {tolF, 1024, 128, 1234},
+                                                        {tolF, 1024, 256, 1234},
+                                                        {tolF, 1024, 32, 1234},
+                                                        {tolF, 1024, 64, 1234},
+                                                        {tolF, 1024, 128, 1234},
+                                                        {tolF, 1024, 256, 1234}};
+
+const std::vector<WeightedMeanInputs<double>> inputsd = {{tolD, 4, 4, 1234},
+                                                         {tolD, 1024, 32, 1234},
+                                                         {tolD, 1024, 64, 1234},
+                                                         {tolD, 1024, 128, 1234},
+                                                         {tolD, 1024, 256, 1234},
+                                                         {tolD, 1024, 32, 1234},
+                                                         {tolD, 1024, 64, 1234},
+                                                         {tolD, 1024, 128, 1234},
+                                                         {tolD, 1024, 256, 1234}};
+
+using RowWeightedMeanTestF = RowWeightedMeanTest<float>;
+TEST_P(RowWeightedMeanTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestF, ::testing::ValuesIn(inputsf));
+
+using RowWeightedMeanTestD = RowWeightedMeanTest<double>;
+TEST_P(RowWeightedMeanTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestD, ::testing::ValuesIn(inputsd));
+
+using ColWeightedMeanTestF = ColWeightedMeanTest<float>;
+TEST_P(ColWeightedMeanTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestF, ::testing::ValuesIn(inputsf));
+
+using ColWeightedMeanTestD = ColWeightedMeanTest<double>;
+TEST_P(ColWeightedMeanTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestD, ::testing::ValuesIn(inputsd));
+
+};  // end namespace stats
+};  // end namespace raft