diff --git a/cpp/include/raft/random/detail/curand_wrappers.hpp b/cpp/include/raft/random/detail/curand_wrappers.hpp new file mode 100644 index 0000000000..969d739cc1 --- /dev/null +++ b/cpp/include/raft/random/detail/curand_wrappers.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <curand.h> + +namespace raft::random { +namespace detail { + +// @todo: We probably want to scrape through and replace any consumers of +// these wrappers with our RNG +/** check for curand runtime API errors and assert accordingly */ +#define CURAND_CHECK(call) \ + do { \ + curandStatus_t status = call; \ + ASSERT(status == CURAND_STATUS_SUCCESS, "FAIL: curand-call='%s'. Reason:%d\n", #call, status); \ + } while (0) + +/** + * @defgroup normal curand normal random number generation operations + * @{ + */ +template <typename T> +curandStatus_t curandGenerateNormal( + curandGenerator_t generator, T* outputPtr, size_t n, T mean, T stddev); + +template <> +inline curandStatus_t curandGenerateNormal( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} + +template <> +inline curandStatus_t curandGenerateNormal( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} +/** @} */ + +}; // end namespace detail +}; // end namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh new file mode 100644 index 0000000000..528d20a284 --- /dev/null +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace raft::random { +namespace detail { + +namespace { + +// generate the labels first and shuffle them instead of shuffling the dataset +template +void generate_labels(IdxT* labels, + IdxT n_rows, + IdxT n_clusters, + bool shuffle, + raft::random::Rng& r, + cudaStream_t stream) +{ + IdxT a, b; + r.affine_transform_params(n_clusters, a, b); + auto op = [=] __device__(IdxT * ptr, IdxT idx) { + if (shuffle) { idx = IdxT((a * int64_t(idx)) + b); } + idx %= n_clusters; + // in the unlikely case of n_clusters > n_rows, make sure that the writes + // do not go out-of-bounds + if (idx < n_rows) { *ptr = idx; } + }; + raft::linalg::writeOnlyUnaryOp(labels, n_rows, op, stream); +} + +template +DI void get_mu_sigma(DataT& mu, + DataT& sigma, + IdxT idx, + const IdxT* labels, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + DataT cluster_std_scalar, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters) +{ + IdxT cid, fid; + if (row_major) { + cid = idx / n_cols; + fid = idx % n_cols; + } else { + cid = idx % n_rows; + fid = idx / n_rows; + } + IdxT center_id; + if (cid < n_rows) { + center_id = labels[cid]; + } else { + center_id = 0; + } + + if (fid >= n_cols) { fid = 0; } + + if (row_major) { + center_id = center_id * n_cols + fid; + } else { + center_id += fid * n_clusters; + } + sigma = cluster_std == nullptr ? cluster_std_scalar : cluster_std[cid]; + mu = centers[center_id]; +} + +template +void generate_data(DataT* out, + const IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major, + const DataT* centers, + const DataT* cluster_std, + const DataT cluster_std_scalar, + raft::random::Rng& rng) +{ + auto op = [=] __device__(DataT & val1, DataT & val2, IdxT idx1, IdxT idx2) { + DataT mu1, sigma1, mu2, sigma2; + get_mu_sigma(mu1, + sigma1, + idx1, + labels, + row_major, + centers, + cluster_std, + cluster_std_scalar, + n_rows, + n_cols, + n_clusters); + get_mu_sigma(mu2, + sigma2, + idx2, + labels, + row_major, + centers, + cluster_std, + cluster_std_scalar, + n_rows, + n_cols, + n_clusters); + raft::random::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2); + }; + rng.custom_distribution2(out, n_rows * n_cols, op, stream); +} + +} // namespace + +template +void make_blobs_caller(DataT* out, + IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major = true, + const DataT* centers = nullptr, + const DataT* cluster_std = nullptr, + const DataT cluster_std_scalar = (DataT)1.0, + bool shuffle = true, + DataT center_box_min = (DataT)-10.0, + DataT center_box_max = (DataT)10.0, + uint64_t seed = 0ULL, + raft::random::GeneratorType type = raft::random::GenPhilox) +{ + raft::random::Rng r(seed, type); + // use the right centers buffer for data generation + rmm::device_uvector rand_centers(0, stream); + const DataT* _centers; + if (centers == nullptr) { + rand_centers.resize(n_clusters * n_cols, stream); + r.uniform(rand_centers.data(), n_clusters * n_cols, center_box_min, center_box_max, stream); + _centers = rand_centers.data(); + } else { + _centers = centers; + } + generate_labels(labels, n_rows, n_clusters, shuffle, r, stream); + generate_data(out, + labels, + n_rows, + n_cols, + n_clusters, + stream, + row_major, + _centers, + cluster_std, + cluster_std_scalar, + r); +} + +} // end namespace detail +} // end namespace raft::random \ No newline at end of file diff --git 
a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh new file mode 100644 index 0000000000..eb8eaf565e --- /dev/null +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Adapted from scikit-learn + * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::random { +namespace detail { + +/* Internal auxiliary function to help build the singular profile */ +template +static __global__ void _singular_profile_kernel(DataT* out, IdxT n, DataT tail_strength, IdxT rank) +{ + IdxT tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n) { + DataT sval = static_cast(tid) / rank; + DataT low_rank = ((DataT)1.0 - tail_strength) * raft::myExp(-sval * sval); + DataT tail = tail_strength * raft::myExp((DataT)-0.1 * sval); + out[tid] = low_rank + tail; + } +} + +/* Internal auxiliary function to generate a low-rank matrix */ +template +static void _make_low_rank_matrix(const raft::handle_t& handle, + DataT* out, + IdxT n_rows, + IdxT n_cols, + IdxT effective_rank, + DataT tail_strength, + raft::random::Rng& r, + cudaStream_t stream) +{ + cusolverDnHandle_t cusolver_handle = handle.get_cusolver_dn_handle(); + cublasHandle_t cublas_handle = handle.get_cublas_handle(); + + IdxT n = std::min(n_rows, n_cols); + + // Generate random (ortho normal) vectors with QR decomposition + rmm::device_uvector rd_mat_0(n_rows * n, stream); + rmm::device_uvector rd_mat_1(n_cols * n, stream); + r.normal(rd_mat_0.data(), n_rows * n, (DataT)0.0, (DataT)1.0, stream); + r.normal(rd_mat_1.data(), n_cols * n, (DataT)0.0, (DataT)1.0, stream); + rmm::device_uvector q0(n_rows * n, stream); + rmm::device_uvector q1(n_cols * n, stream); + raft::linalg::qrGetQ(handle, rd_mat_0.data(), q0.data(), n_rows, n, stream); + raft::linalg::qrGetQ(handle, rd_mat_1.data(), q1.data(), n_cols, n, stream); + + // Build the singular profile by assembling signal and noise components + rmm::device_uvector singular_vec(n, stream); + _singular_profile_kernel<<(n, 256), 256, 0, stream>>>( + singular_vec.data(), n, tail_strength, effective_rank); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + rmm::device_uvector singular_mat(n * n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(singular_mat.data(), 0, n * n * sizeof(DataT), stream)); + raft::matrix::initializeDiagonalMatrix(singular_vec.data(), singular_mat.data(), n, n, stream); + + // Generate the column-major matrix + rmm::device_uvector temp_q0s(n_rows * n, stream); + rmm::device_uvector temp_out(n_rows * n_cols, stream); + DataT alpha = 1.0, beta = 0.0; + raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_rows, + n, + n, + &alpha, + q0.data(), + n_rows, + 
singular_mat.data(), + n, + &beta, + temp_q0s.data(), + n_rows, + stream); + raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n_rows, + n_cols, + n, + &alpha, + temp_q0s.data(), + n_rows, + q1.data(), + n_cols, + &beta, + temp_out.data(), + n_rows, + stream); + + // Transpose from column-major to row-major + raft::linalg::transpose(handle, temp_out.data(), out, n_rows, n_cols, stream); +} + +/* Internal auxiliary function to permute rows in the given matrix according + * to a given permutation vector */ +template +static __global__ void _gather2d_kernel( + DataT* out, const DataT* in, const IdxT* perms, IdxT n_rows, IdxT n_cols) +{ + IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < n_rows) { + const DataT* row_in = in + n_cols * perms[tid]; + DataT* row_out = out + n_cols * tid; + + for (IdxT i = 0; i < n_cols; i++) { + row_out[i] = row_in[i]; + } + } +} + +template +void make_regression_caller(const raft::handle_t& handle, + DataT* out, + DataT* values, + IdxT n_rows, + IdxT n_cols, + IdxT n_informative, + cudaStream_t stream, + DataT* coef = nullptr, + IdxT n_targets = (IdxT)1, + DataT bias = (DataT)0.0, + IdxT effective_rank = (IdxT)-1, + DataT tail_strength = (DataT)0.5, + DataT noise = (DataT)0.0, + bool shuffle = true, + uint64_t seed = 0ULL, + raft::random::GeneratorType type = raft::random::GenPhilox) +{ + n_informative = std::min(n_informative, n_cols); + + cusolverDnHandle_t cusolver_handle = handle.get_cusolver_dn_handle(); + cublasHandle_t cublas_handle = handle.get_cublas_handle(); + + cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST); + raft::random::Rng r(seed, type); + + if (effective_rank < 0) { + // Randomly generate a well conditioned input set + r.normal(out, n_rows * n_cols, (DataT)0.0, (DataT)1.0, stream); + } else { + // Randomly generate a low rank, fat tail input set + _make_low_rank_matrix(handle, out, n_rows, n_cols, effective_rank, tail_strength, r, stream); + } + + // Use the right output buffer for the values + rmm::device_uvector tmp_values(0, stream); + DataT* _values; + if (shuffle) { + tmp_values.resize(n_rows * n_targets, stream); + _values = tmp_values.data(); + } else { + _values = values; + } + // Create a column-major matrix of output values only if it has more + // than 1 column + rmm::device_uvector values_col(0, stream); + DataT* _values_col; + if (n_targets > 1) { + values_col.resize(n_rows * n_targets, stream); + _values_col = values_col.data(); + } else { + _values_col = _values; + } + + // Use the right buffer for the coefficients + rmm::device_uvector tmp_coef(0, stream); + DataT* _coef; + if (coef != nullptr && !shuffle) { + _coef = coef; + } else { + tmp_coef.resize(n_cols * n_targets, stream); + _coef = tmp_coef.data(); + } + + // Generate a ground truth model with only n_informative features + r.uniform(_coef, n_informative * n_targets, (DataT)1.0, (DataT)100.0, stream); + if (coef && n_informative != n_cols) { + RAFT_CUDA_TRY(cudaMemsetAsync(_coef + n_informative * n_targets, + 0, + (n_cols - n_informative) * n_targets * sizeof(DataT), + stream)); + } + + // Compute the output values + DataT alpha = (DataT)1.0, beta = (DataT)0.0; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_T, + n_rows, + n_targets, + n_informative, + &alpha, + out, + n_cols, + _coef, + n_targets, + &beta, + _values_col, + n_rows, + stream)); + + // Transpose the values from column-major to row-major if needed + if (n_targets > 1) { + raft::linalg::transpose(handle, 
_values_col, _values, n_rows, n_targets, stream); + } + + if (bias != 0.0) { + // Add bias + raft::linalg::addScalar(_values, _values, bias, n_rows * n_targets, stream); + } + + rmm::device_uvector white_noise(0, stream); + if (noise != 0.0) { + // Add white noise + white_noise.resize(n_rows * n_targets, stream); + r.normal(white_noise.data(), n_rows * n_targets, (DataT)0.0, noise, stream); + raft::linalg::add(_values, _values, white_noise.data(), n_rows * n_targets, stream); + } + + if (shuffle) { + rmm::device_uvector tmp_out(n_rows * n_cols, stream); + rmm::device_uvector perms_samples(n_rows, stream); + rmm::device_uvector perms_features(n_cols, stream); + + constexpr IdxT Nthreads = 256; + + // Shuffle the samples from out to tmp_out + raft::random::permute( + perms_samples.data(), tmp_out.data(), out, n_cols, n_rows, true, stream); + IdxT nblks_rows = raft::ceildiv(n_rows, Nthreads); + _gather2d_kernel<<>>( + values, _values, perms_samples.data(), n_rows, n_targets); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Shuffle the features from tmp_out to out + raft::random::permute( + perms_features.data(), out, tmp_out.data(), n_rows, n_cols, false, stream); + + // Shuffle the coefficients accordingly + if (coef != nullptr) { + IdxT nblks_cols = raft::ceildiv(n_cols, Nthreads); + _gather2d_kernel<<>>( + coef, _coef, perms_features.data(), n_cols, n_targets); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } +} + +} // namespace detail +} // namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh new file mode 100644 index 0000000000..bf79b3cb71 --- /dev/null +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "curand_wrappers.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// mvg.cuh takes in matrices that are colomn major (as in fortan) +#define IDX2C(i, j, ld) (j * ld + i) + +namespace raft::random { +namespace detail { + +enum Filler : unsigned char { + LOWER, // = 0 + UPPER // = 1 +}; // used in memseting upper/lower matrix + +/** + * @brief Reset values within the epsilon absolute range to zero + * @tparam T the data type + * @param eig the array + * @param epsilon the range + * @param size length of the array + * @param stream cuda stream + */ +template +void epsilonToZero(T* eig, T epsilon, int size, cudaStream_t stream) +{ + raft::linalg::unaryOp( + eig, + eig, + size, + [epsilon] __device__(T in) { return (in < epsilon && in > -epsilon) ? 
T(0.0) : in; }, + stream); +} + +/** + * @brief Broadcast addition of vector onto a matrix + * @tparam the data type + * @param out the output matrix + * @param in_m the input matrix + * @param in_v the input vector + * @param scalar scalar multiplier + * @param rows number of rows in the input matrix + * @param cols number of cols in the input matrix + * @param stream cuda stream + */ +template +void matVecAdd( + T* out, const T* in_m, const T* in_v, T scalar, int rows, int cols, cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + out, + in_m, + in_v, + cols, + rows, + true, + true, + [=] __device__(T mat, T vec) { return mat + scalar * vec; }, + stream); +} + +// helper kernels +template +__global__ void combined_dot_product(int rows, int cols, const T* W, T* matrix, int* check) +{ + int m_i = threadIdx.x + blockDim.x * blockIdx.x; + int Wi = m_i / cols; + if (m_i < cols * rows) { + if (W[Wi] >= 0.0) + matrix[m_i] = pow(W[Wi], 0.5) * (matrix[m_i]); + else + check[0] = Wi; // reports Wi'th eigen values is negative. + } +} + +template // if uplo = 0, lower part of dim x dim matrix set to +// value +__global__ void fill_uplo(int dim, Filler uplo, T value, T* A) +{ + int j = threadIdx.x + blockDim.x * blockIdx.x; + int i = threadIdx.y + blockDim.y * blockIdx.y; + if (i < dim && j < dim) { + // making off-diagonals == value + if (i < j) { + if (uplo == 1) A[IDX2C(i, j, dim)] = value; + } else if (i > j) { + if (uplo == 0) A[IDX2C(i, j, dim)] = value; + } + } +} + +template +class multi_variable_gaussian_impl { + public: + enum Decomposer : unsigned char { chol_decomp, jacobi, qr }; + + private: + // adjustable stuff + const int dim; + const int nPoints = 1; + const double tol = 1.e-7; + const T epsilon = 1.e-12; + const int max_sweeps = 100; + cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + const Decomposer method; + + // not so much + T *P = 0, *X = 0, *x = 0, *workspace_decomp = 0, *eig = 0; + int *info, Lwork, info_h; + syevjInfo_t syevj_params = NULL; + curandGenerator_t gen; + const raft::handle_t& handle; + cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; + bool deinitilized = false; + + public: // functions + multi_variable_gaussian_impl() = delete; + multi_variable_gaussian_impl(const raft::handle_t& handle, const int dim, Decomposer method) + : handle(handle), dim(dim), method(method) + { + auto cusolverHandle = handle.get_cusolver_dn_handle(); + + CURAND_CHECK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(gen, 28)); // SEED + if (method == chol_decomp) { + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize( + cusolverHandle, uplo, dim, P, dim, &Lwork)); + } else if (method == jacobi) { // jacobi init + RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params)); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol)); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, max_sweeps)); + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevj_bufferSize( + cusolverHandle, jobz, uplo, dim, P, dim, eig, &Lwork, syevj_params)); + } else { // method == qr + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevd_bufferSize( + cusolverHandle, jobz, uplo, dim, P, dim, eig, &Lwork)); + } + } + + std::size_t get_workspace_size() + { + // malloc workspace_decomp + std::size_t granuality = 256, offset = 0; + workspace_decomp = (T*)offset; + offset += raft::alignTo(sizeof(T) * Lwork, granuality); + eig = (T*)offset; + offset += raft::alignTo(sizeof(T) * dim, granuality); + info = 
(int*)offset; + offset += raft::alignTo(sizeof(int), granuality); + return offset; + } + + void set_workspace(T* workarea) + { + workspace_decomp = (T*)((std::size_t)workspace_decomp + (std::size_t)workarea); + eig = (T*)((std::size_t)eig + (std::size_t)workarea); + info = (int*)((std::size_t)info + (std::size_t)workarea); + } + + void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) + { + auto cusolverHandle = handle.get_cusolver_dn_handle(); + auto cublasHandle = handle.get_cublas_handle(); + auto cudaStream = handle.get_stream(); + if (method == chol_decomp) { + // lower part will contains chol_decomp + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf( + cusolverHandle, uplo, dim, P, dim, workspace_decomp, Lwork, info, cudaStream)); + } else if (method == jacobi) { + RAFT_CUSOLVER_TRY( + raft::linalg::detail::cusolverDnsyevj(cusolverHandle, + jobz, + uplo, + dim, + P, + dim, + eig, + workspace_decomp, + Lwork, + info, + syevj_params, + cudaStream)); // vectors stored as cols. & col major + } else { // qr + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevd( + cusolverHandle, jobz, uplo, dim, P, dim, eig, workspace_decomp, Lwork, info, cudaStream)); + } + raft::update_host(&info_h, info, 1, cudaStream); + RAFT_CUDA_TRY(cudaStreamSynchronize(cudaStream)); + ASSERT(info_h == 0, "mvg: error in syevj/syevd/potrf, info=%d | expected=0", info_h); + T mean = 0.0, stddv = 1.0; + // generate nxN gaussian nums in X + CURAND_CHECK( + detail::curandGenerateNormal(gen, X, (nPoints * dim) + (nPoints * dim) % 2, mean, stddv)); + T alfa = 1.0, beta = 0.0; + if (method == chol_decomp) { + // upper part (0) being filled with 0.0 + dim3 block(32, 32); + dim3 grid(raft::ceildiv(dim, (int)block.x), raft::ceildiv(dim, (int)block.y)); + fill_uplo<<>>(dim, UPPER, (T)0.0, P); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // P is lower triangular chol decomp mtrx + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, + CUBLAS_OP_N, + CUBLAS_OP_N, + dim, + nPoints, + dim, + &alfa, + P, + dim, + X, + dim, + &beta, + X, + dim, + cudaStream)); + } else { + epsilonToZero(eig, epsilon, dim, cudaStream); + dim3 block(64); + dim3 grid(raft::ceildiv(dim, (int)block.x)); + RAFT_CUDA_TRY(cudaMemsetAsync(info, 0, sizeof(int), cudaStream)); + grid.x = raft::ceildiv(dim * dim, (int)block.x); + combined_dot_product<<>>(dim, dim, eig, P, info); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // checking if any eigen vals were negative + raft::update_host(&info_h, info, 1, cudaStream); + RAFT_CUDA_TRY(cudaStreamSynchronize(cudaStream)); + ASSERT(info_h == 0, "mvg: Cov matrix has %dth Eigenval negative", info_h); + + // Got Q = eigvect*eigvals.sqrt in P, Q*X in X below + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, + CUBLAS_OP_N, + CUBLAS_OP_N, + dim, + nPoints, + dim, + &alfa, + P, + dim, + X, + dim, + &beta, + X, + dim, + cudaStream)); + } + // working to make mean not 0 + // since we are working with column-major, nPoints and dim are swapped + if (x != NULL) matVecAdd(X, X, x, T(1.0), nPoints, dim, cudaStream); + } + + void deinit() + { + if (deinitilized) return; + CURAND_CHECK(curandDestroyGenerator(gen)); + RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); + deinitilized = true; + } + + ~multi_variable_gaussian_impl() { deinit(); } +}; // end of multi_variable_gaussian_impl + +}; // end of namespace detail +}; // end of namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/detail/permute.cuh b/cpp/include/raft/random/detail/permute.cuh 
new file mode 100644 index 0000000000..28eaf9136c --- /dev/null +++ b/cpp/include/raft/random/detail/permute.cuh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::random { +namespace detail { + +template +__global__ void permuteKernel( + IntType* perms, Type* out, const Type* in, IdxType a, IdxType b, IdxType N, IdxType D) +{ + namespace cg = cooperative_groups; + const int WARP_SIZE = 32; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // having shuffled input indices and coalesced output indices appears + // to be preferrable to the reverse, especially for column major + IntType inIdx = ((a * int64_t(tid)) + b) % N; + IntType outIdx = tid; + + if (perms != nullptr && tid < N) { perms[outIdx] = inIdx; } + + if (out == nullptr || in == nullptr) { return; } + + if (rowMajor) { + cg::thread_block_tile warp = cg::tiled_partition(cg::this_thread_block()); + + __shared__ IntType inIdxShm[TPB]; + __shared__ IntType outIdxShm[TPB]; + inIdxShm[threadIdx.x] = inIdx; + outIdxShm[threadIdx.x] = outIdx; + warp.sync(); + + int warpID = threadIdx.x / WARP_SIZE; + int laneID = threadIdx.x % WARP_SIZE; + for (int i = warpID * WARP_SIZE; i < warpID * WARP_SIZE + WARP_SIZE; ++i) { + if (outIdxShm[i] < N) { +#pragma unroll + for (int j = laneID; j < D; j += WARP_SIZE) { + out[outIdxShm[i] * D + j] = in[inIdxShm[i] * D + j]; + } + } + } + } else { +#pragma unroll + for (int j = 0; j < D; ++j) { + if (tid < N) { out[outIdx + j * N] = in[inIdx + j * N]; } + } + } +} + +// This is wrapped in a type to allow for partial template specialization +template +struct permute_impl_t { + static void permuteImpl(IntType* perms, + Type* out, + const Type* in, + IdxType N, + IdxType D, + int nblks, + IdxType a, + IdxType b, + cudaStream_t stream) + { + // determine vector type and set new pointers + typedef typename raft::IOType::Type VType; + VType* vout = reinterpret_cast(out); + const VType* vin = reinterpret_cast(in); + + // check if we can execute at this vector length + if (D % VLen == 0 && raft::is_aligned(vout, sizeof(VType)) && + raft::is_aligned(vin, sizeof(VType))) { + permuteKernel + <<>>(perms, vout, vin, a, b, N, D / VLen); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } else { // otherwise try the next lower vector length + permute_impl_t::permuteImpl( + perms, out, in, N, D, nblks, a, b, stream); + } + } +}; + +// at vector length 1 we just execute a scalar version to break the recursion +template +struct permute_impl_t { + static void permuteImpl(IntType* perms, + Type* out, + const Type* in, + IdxType N, + IdxType D, + int nblks, + IdxType a, + IdxType b, + cudaStream_t stream) + { + permuteKernel + <<>>(perms, out, in, a, b, N, D); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +}; + +template +void permute(IntType* perms, + Type* out, + const Type* in, + IntType D, + IntType N, + bool rowMajor, + cudaStream_t stream) +{ + auto nblks = 
raft::ceildiv(N, (IntType)TPB); + + // always keep 'a' to be coprime to N + IdxType a = rand() % N; + while (raft::gcd(a, N) != 1) + a = (a + 1) % N; + IdxType b = rand() % N; + + if (rowMajor) { + permute_impl_t 0) ? 16 / sizeof(Type) : 1>::permuteImpl(perms, + out, + in, + N, + D, + nblks, + a, + b, + stream); + } else { + permute_impl_t::permuteImpl( + perms, out, in, N, D, nblks, a, b, stream); + } +} + +}; // end namespace detail +}; // end namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/make_blobs.hpp b/cpp/include/raft/random/make_blobs.hpp new file mode 100644 index 0000000000..afdabfe55b --- /dev/null +++ b/cpp/include/raft/random/make_blobs.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/make_blobs.cuh" + +namespace raft::random { + +/** + * @brief GPU-equivalent of sklearn.datasets.make_blobs + * + * @tparam DataT output data type + * @tparam IdxT indexing arithmetic type + * + * @param[out] out generated data [on device] + * [dim = n_rows x n_cols] + * @param[out] labels labels for the generated data [on device] + * [len = n_rows] + * @param[in] n_rows number of rows in the generated data + * @param[in] n_cols number of columns in the generated data + * @param[in] n_clusters number of clusters (or classes) to generate + * @param[in] stream cuda stream to schedule the work on + * @param[in] row_major whether input `centers` and output `out` + * buffers are to be stored in row or column + * major layout + * @param[in] centers centers of each of the cluster, pass a nullptr + * if you need this also to be generated randomly + * [on device] [dim = n_clusters x n_cols] + * @param[in] cluster_std standard deviation of each cluster center, + * pass a nullptr if this is to be read from the + * `cluster_std_scalar`. [on device] + * [len = n_clusters] + * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as + * the std-dev across all dimensions. + * @param[in] shuffle shuffle the generated dataset and labels + * @param[in] center_box_min min value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] center_box_max max value of box from which to pick cluster + * centers. 
Useful only if 'centers' is nullptr + * @param[in] seed seed for the RNG + * @param[in] type RNG type + */ +template +void make_blobs(DataT* out, + IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major = true, + const DataT* centers = nullptr, + const DataT* cluster_std = nullptr, + const DataT cluster_std_scalar = (DataT)1.0, + bool shuffle = true, + DataT center_box_min = (DataT)-10.0, + DataT center_box_max = (DataT)10.0, + uint64_t seed = 0ULL, + GeneratorType type = GenPhilox) +{ + detail::make_blobs_caller(out, + labels, + n_rows, + n_cols, + n_clusters, + stream, + row_major, + centers, + cluster_std, + cluster_std_scalar, + shuffle, + center_box_min, + center_box_max, + seed, + type); +} + +} // end namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp new file mode 100644 index 0000000000..d6fceff466 --- /dev/null +++ b/cpp/include/raft/random/make_regression.hpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Adapted from scikit-learn + * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py + */ + +#pragma once + +#include + +#include "detail/make_regression.cuh" + +namespace raft::random { + +/** + * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at: + * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html + * + * @tparam DataT Scalar type + * @tparam IdxT Index type + * + * @param[in] handle RAFT handle + * @param[out] out Row-major (samples, features) matrix to store + * the problem data + * @param[out] values Row-major (samples, targets) matrix to store + * the values for the regression problem + * @param[in] n_rows Number of samples + * @param[in] n_cols Number of features + * @param[in] n_informative Number of informative features (non-zero + * coefficients) + * @param[in] stream CUDA stream + * @param[out] coef Row-major (features, targets) matrix to store + * the coefficients used to generate the values + * for the regression problem. If nullptr is + * given, nothing will be written + * @param[in] n_targets Number of targets (generated values per sample) + * @param[in] bias A scalar that will be added to the values + * @param[in] effective_rank The approximate rank of the data matrix (used + * to create correlations in the data). 
-1 is the + * code to use well-conditioned data + * @param[in] tail_strength The relative importance of the fat noisy tail + * of the singular values profile if + * effective_rank is not -1 + * @param[in] noise Standard deviation of the gaussian noise + * applied to the output + * @param[in] shuffle Shuffle the samples and the features + * @param[in] seed Seed for the random number generator + * @param[in] type Random generator type + */ +template +void make_regression(const raft::handle_t& handle, + DataT* out, + DataT* values, + IdxT n_rows, + IdxT n_cols, + IdxT n_informative, + cudaStream_t stream, + DataT* coef = nullptr, + IdxT n_targets = (IdxT)1, + DataT bias = (DataT)0.0, + IdxT effective_rank = (IdxT)-1, + DataT tail_strength = (DataT)0.5, + DataT noise = (DataT)0.0, + bool shuffle = true, + uint64_t seed = 0ULL, + GeneratorType type = GenPhilox) +{ + detail::make_regression_caller(handle, + out, + values, + n_rows, + n_cols, + n_informative, + stream, + coef, + n_targets, + bias, + effective_rank, + tail_strength, + noise, + shuffle, + seed, + type); +} + +} // namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp new file mode 100644 index 0000000000..c2af52322a --- /dev/null +++ b/cpp/include/raft/random/multi_variable_gaussian.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/multi_variable_gaussian.cuh" + +namespace raft::random { + +template <typename T> +class multi_variable_gaussian : public detail::multi_variable_gaussian_impl<T> { + public: + // using Decomposer = typename detail::multi_variable_gaussian_impl<T>::Decomposer; + // using detail::multi_variable_gaussian_impl<T>::Decomposer::chol_decomp; + // using detail::multi_variable_gaussian_impl<T>::Decomposer::jacobi; + // using detail::multi_variable_gaussian_impl<T>::Decomposer::qr; + + multi_variable_gaussian() = delete; + multi_variable_gaussian(const raft::handle_t& handle, + const int dim, + typename detail::multi_variable_gaussian_impl<T>::Decomposer method) + : detail::multi_variable_gaussian_impl<T>{handle, dim, method} + { + } + + std::size_t get_workspace_size() + { + return detail::multi_variable_gaussian_impl<T>::get_workspace_size(); + } + + void set_workspace(T* workarea) + { + detail::multi_variable_gaussian_impl<T>::set_workspace(workarea); + } + + void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) + { + detail::multi_variable_gaussian_impl<T>::give_gaussian(nPoints, P, X, x); + } + + void deinit() { detail::multi_variable_gaussian_impl<T>::deinit(); } + + ~multi_variable_gaussian() { deinit(); } +}; // end of multi_variable_gaussian + +}; // end of namespace raft::random \ No newline at end of file diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp new file mode 100644 index 0000000000..32ed3779e4 --- /dev/null +++ b/cpp/include/raft/random/permute.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/permute.cuh" + +namespace raft::random { + +/** + * @brief Generate permutations of the input array. A useful primitive for + * shuffling input datasets in ML algorithms. See the note at the end for some of its + * limitations! + * @tparam Type Data type of the array to be shuffled + * @tparam IntType Integer type used for the perms array + * @tparam IdxType Integer type used for addressing indices + * @tparam TPB threads per block + * @param perms the output permutation indices. Typically useful only when + * one wants to refer back. If you don't need this, pass a nullptr + * @param out the output shuffled array. Pass nullptr if you don't want this to + * be written. For example, when you only want the perms array to be filled. + * @param in input array (in-place is not supported due to race conditions!) + * @param D number of columns of the input array + * @param N length of the input array (or number of rows) + * @param rowMajor whether the input/output matrices are row or col major + * @param stream cuda stream where to launch the work + * + * @note This is NOT a uniform permutation generator! In fact, it only generates + * a very small percentage of permutations. If your application really requires a + * high-quality permutation generator, it is recommended that you pick + * Knuth Shuffle.
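+ *
+ * A minimal usage sketch (not from the original sources; it assumes a valid
+ * cudaStream_t `stream`, int dimensions `N` (rows) and `D` (columns), and an
+ * N x D row-major float device array `data`; all buffer names are illustrative):
+ * @code{.cpp}
+ *   rmm::device_uvector<int> perms(N, stream);            // permutation indices
+ *   rmm::device_uvector<float> shuffled(N * D, stream);   // shuffled copy of `data`
+ *   // shuffle the rows of `data` into `shuffled`, keeping the permutation in `perms`
+ *   raft::random::permute(perms.data(), shuffled.data(), data, D, N, true, stream);
+ * @endcode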
+ */ +template +void permute(IntType* perms, + Type* out, + const Type* in, + IntType D, + IntType N, + bool rowMajor, + cudaStream_t stream) +{ + detail::permute(perms, out, in, D, N, rowMajor, stream); +} + +}; // end namespace raft::random \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 75b415814a..654ab73f84 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -74,6 +74,10 @@ add_executable(test_raft test/mr/host/buffer.cpp test/mr/device/buffer.cpp test/mst.cu + test/random/make_blobs.cu + test/random/make_regression.cu + test/random/multi_variable_gaussian.cu + test/random/permute.cu test/random/rng.cu test/random/rng_int.cu test/random/sample_without_replacement.cu diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu new file mode 100644 index 0000000000..8c7e440d0e --- /dev/null +++ b/cpp/test/random/make_blobs.cu @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include + +namespace raft::random { + +template +__global__ void meanKernel(T* out, + int* lens, + const T* data, + const int* labels, + int nrows, + int ncols, + int nclusters, + bool row_major) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int rowid = row_major ? tid / ncols : tid % nrows; + int colid = row_major ? tid % ncols : tid / nrows; + if (rowid < nrows && colid < ncols) { + T val = data[tid]; + int label = labels[rowid]; + int idx = row_major ? label * ncols + colid : colid * nclusters + label; + raft::myAtomicAdd(out + idx * 2, val); + raft::myAtomicAdd(out + idx * 2 + 1, val * val); + if (colid == 0) { raft::myAtomicAdd(lens + label, 1); } + } +} + +template +__global__ void compute_mean_var( + T* out, const T* stats, int* lens, int nrows, int ncols, bool row_major) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int rowid = row_major ? tid / ncols : tid % nrows; + int colid = row_major ? tid % ncols : tid / nrows; + int stride = nrows * ncols; + if (rowid < nrows && colid < ncols) { + int len = lens[rowid]; + auto mean = stats[tid * 2] / len; + out[tid] = mean; + out[tid + stride] = (stats[tid * 2 + 1] / len) - (mean * mean); + } +} + +template +struct MakeBlobsInputs { + T tolerance; + int rows, cols, n_clusters; + T std; + bool row_major, shuffle; + raft::random::GeneratorType gtype; + uint64_t seed; +}; + +template +class MakeBlobsTest : public ::testing::TestWithParam> { + public: + MakeBlobsTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + mu_vec(params.cols * params.n_clusters, stream), + mean_var(2 * params.n_clusters * params.cols, stream) + { + } + + protected: + void SetUp() override + { + // Tests are configured with their expected test-values sigma. For example, + // 4 x sigma indicates the test shouldn't fail 99.9% of the time. 
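+    // (In check() below the device arrays are compared with CompareApprox, whose
+    //  epsilon is num_sigma * params.tolerance, so num_sigma directly scales how
+    //  loose the mean/variance match is allowed to be.)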
+ num_sigma = 50; + auto len = params.rows * params.cols; + raft::random::Rng r(params.seed, params.gtype); + + rmm::device_uvector data(len, stream); + rmm::device_uvector labels(params.rows, stream); + rmm::device_uvector stats(2 * params.n_clusters * params.cols, stream); + rmm::device_uvector lens(params.n_clusters, stream); + + RAFT_CUDA_TRY(cudaMemsetAsync(stats.data(), 0, stats.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(mean_var.data(), 0, mean_var.size() * sizeof(T), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(lens.data(), 0, lens.size() * sizeof(int), stream)); + + r.uniform(mu_vec.data(), params.cols * params.n_clusters, T(-10.0), T(10.0), stream); + T* sigma_vec = nullptr; + make_blobs(data.data(), + labels.data(), + params.rows, + params.cols, + params.n_clusters, + stream, + params.row_major, + mu_vec.data(), + sigma_vec, + params.std, + params.shuffle, + T(-10.0), + T(10.0), + params.seed, + params.gtype); + static const int threads = 128; + meanKernel<<>>(stats.data(), + lens.data(), + data.data(), + labels.data(), + params.rows, + params.cols, + params.n_clusters, + params.row_major); + int len1 = params.n_clusters * params.cols; + compute_mean_var<<>>( + mean_var.data(), stats.data(), lens.data(), params.n_clusters, params.cols, params.row_major); + } + + void check() + { + int len = params.n_clusters * params.cols; + auto compare = raft::CompareApprox(num_sigma * params.tolerance); + ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare, stream)); + ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare, stream)); + } + + protected: + MakeBlobsInputs params; + raft::handle_t handle; + cudaStream_t stream = 0; + + rmm::device_uvector mu_vec, mean_var; + int num_sigma; +}; + +typedef MakeBlobsTest MakeBlobsTestF; +const std::vector> inputsf_t = { + {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.f, false, true, 
raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, + + {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL}, +}; + +TEST_P(MakeBlobsTestF, Result) { check(); } +INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestF, ::testing::ValuesIn(inputsf_t)); + +typedef MakeBlobsTest MakeBlobsTestD; +const std::vector> inputsd_t = { + {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, true, 
raft::random::GenPhilox, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + + {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL}, + {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, + {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL}, +}; +TEST_P(MakeBlobsTestD, Result) { check(); } +INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestD, ::testing::ValuesIn(inputsd_t)); + +} // end namespace raft::random \ No newline at end of file diff --git a/cpp/test/random/make_regression.cu b/cpp/test/random/make_regression.cu new file mode 100644 index 0000000000..01c3008cd3 --- /dev/null +++ b/cpp/test/random/make_regression.cu @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft::random { + +template +struct MakeRegressionInputs { + T tolerance; + int n_samples, n_features, n_informative, n_targets, effective_rank; + T bias; + bool shuffle; + raft::random::GeneratorType gtype; + uint64_t seed; +}; + +template +class MakeRegressionTest : public ::testing::TestWithParam> { + public: + MakeRegressionTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + values_ret(params.n_samples * params.n_targets, stream), + values_prod(params.n_samples * params.n_targets, stream) + { + } + + protected: + void SetUp() override + { + // Noise must be zero to compare the actual and expected values + T noise = (T)0.0, tail_strength = (T)0.5; + + rmm::device_uvector data(params.n_samples * params.n_features, stream); + rmm::device_uvector values_cm(params.n_samples * params.n_targets, stream); + rmm::device_uvector coef(params.n_features * params.n_targets, stream); + + // Create the regression problem + make_regression(handle, + data.data(), + values_ret.data(), + params.n_samples, + params.n_features, + params.n_informative, + stream, + coef.data(), + params.n_targets, + params.bias, + params.effective_rank, + tail_strength, + noise, + params.shuffle, + params.seed, + params.gtype); + + // Calculate the values from the data and coefficients (column-major) + T alpha = (T)1.0, beta = (T)0.0; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_T, + params.n_samples, + params.n_targets, + params.n_features, + &alpha, + data.data(), + params.n_features, + coef.data(), + params.n_targets, + &beta, + values_cm.data(), + params.n_samples, + stream)); + + // Transpose the values to row-major + raft::linalg::transpose( + handle, values_cm.data(), values_prod.data(), params.n_samples, params.n_targets, stream); + + // Add the bias + raft::linalg::addScalar(values_prod.data(), + values_prod.data(), + params.bias, + params.n_samples * params.n_targets, + stream); + + // Count the number of zeroes in the coefficients + thrust::device_ptr __coef = thrust::device_pointer_cast(coef.data()); + zero_count = thrust::count(__coef, __coef + params.n_features * params.n_targets, (T)0.0); + } + + protected: + raft::handle_t handle; + cudaStream_t stream = 0; + + MakeRegressionInputs params; + rmm::device_uvector values_ret, values_prod; + int zero_count; +}; + +typedef MakeRegressionTest MakeRegressionTestF; +const std::vector> inputsf_t = { + {0.01f, 256, 32, 16, 1, -1, 0.f, true, raft::random::GenPhilox, 1234ULL}, + {0.01f, 1000, 100, 47, 4, 65, 4.2f, true, raft::random::GenPhilox, 1234ULL}, + {0.01f, 20000, 500, 450, 13, -1, -3.f, false, raft::random::GenPhilox, 1234ULL}}; + +TEST_P(MakeRegressionTestF, Result) +{ + ASSERT_TRUE(match(params.n_targets * (params.n_features - params.n_informative), + zero_count, + raft::Compare())); + ASSERT_TRUE(devArrMatch(values_ret.data(), + values_prod.data(), + params.n_samples, + params.n_targets, + raft::CompareApprox(params.tolerance), + stream)); +} +INSTANTIATE_TEST_CASE_P(MakeRegressionTests, MakeRegressionTestF, ::testing::ValuesIn(inputsf_t)); + +typedef MakeRegressionTest MakeRegressionTestD; +const std::vector> inputsd_t = { + {0.01, 256, 32, 16, 1, -1, 0.0, true, raft::random::GenPhilox, 1234ULL}, + {0.01, 1000, 100, 47, 4, 65, 4.2, true, raft::random::GenPhilox, 1234ULL}, + {0.01, 20000, 500, 450, 13, -1, -3.0, false, 
raft::random::GenPhilox, 1234ULL}}; + +TEST_P(MakeRegressionTestD, Result) +{ + ASSERT_TRUE(match(params.n_targets * (params.n_features - params.n_informative), + zero_count, + raft::Compare())); + ASSERT_TRUE(devArrMatch(values_ret.data(), + values_prod.data(), + params.n_samples, + params.n_targets, + raft::CompareApprox(params.tolerance), + stream)); +} +INSTANTIATE_TEST_CASE_P(MakeRegressionTests, MakeRegressionTestD, ::testing::ValuesIn(inputsd_t)); + +} // end namespace raft::random \ No newline at end of file diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu new file mode 100644 index 0000000000..daafdbc754 --- /dev/null +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include + +// mvg.h takes in matrices that are colomn major (as in fortan) +#define IDX2C(i, j, ld) (j * ld + i) + +namespace raft::random { + +// helper kernels +/// @todo Duplicate called vctwiseAccumulate in utils.h (Kalman Filters, +// i think that is much better to use., more general) +template +__global__ void En_KF_accumulate(const int nPoints, const int dim, const T* X, T* x) +{ + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int col = idx % dim; + int row = idx / dim; + if (col < dim && row < nPoints) raft::myAtomicAdd(x + col, X[idx]); +} + +template +__global__ void En_KF_normalize(const int divider, const int dim, T* x) +{ + int xi = threadIdx.x + blockDim.x * blockIdx.x; + if (xi < dim) x[xi] = x[xi] / divider; +} + +template +__global__ void En_KF_dif(const int nPoints, const int dim, const T* X, const T* x, T* X_diff) +{ + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int col = idx % dim; + int row = idx / dim; + if (col < dim && row < nPoints) X_diff[idx] = X[idx] - x[col]; +} + +// for specialising tests +enum Correlation : unsigned char { + CORRELATED, // = 0 + UNCORRELATED +}; + +template +struct MVGInputs { + T tolerance; + typename multi_variable_gaussian::Decomposer method; + Correlation corr; + int dim, nPoints; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MVGInputs& dims) +{ + return os; +} + +template +class MVGTest : public ::testing::TestWithParam> { + protected: + MVGTest() + : workspace_d(0, handle.get_stream()), + P_d(0, handle.get_stream()), + x_d(0, handle.get_stream()), + X_d(0, handle.get_stream()), + Rand_cov(0, handle.get_stream()), + Rand_mean(0, handle.get_stream()) + { + } + + void SetUp() override + { + // getting params + params = ::testing::TestWithParam>::GetParam(); + dim = params.dim; + nPoints = params.nPoints; + method = params.method; + corr = params.corr; + tolerance = params.tolerance; + + auto cublasH = handle.get_cublas_handle(); + auto cusolverH = handle.get_cusolver_dn_handle(); + auto stream = handle.get_stream(); + + // 
preparing to store stuff + P.resize(dim * dim); + x.resize(dim); + X.resize(dim * nPoints); + P_d.resize(dim * dim, stream); + X_d.resize(nPoints * dim, stream); + x_d.resize(dim, stream); + Rand_cov.resize(dim * dim, stream); + Rand_mean.resize(dim, stream); + + // generating random mean and cov. + srand(params.seed); + for (int j = 0; j < dim; j++) + x.data()[j] = rand() % 100 + 5.0f; + + // for random Cov. martix + std::default_random_engine generator(params.seed); + std::uniform_real_distribution distribution(0.0, 1.0); + + // P (developing a +ve definite symm matrix) + for (int j = 0; j < dim; j++) { + for (int i = 0; i < j + 1; i++) { + T k = distribution(generator); + if (corr == UNCORRELATED) k = 0.0; + P.data()[IDX2C(i, j, dim)] = k; + P.data()[IDX2C(j, i, dim)] = k; + if (i == j) P.data()[IDX2C(i, j, dim)] += dim; + } + } + + // porting inputs to gpu + raft::update_device(P_d.data(), P.data(), dim * dim, stream); + raft::update_device(x_d.data(), x.data(), dim, stream); + + // initilizing the mvg + mvg = new multi_variable_gaussian(handle, dim, method); + std::size_t o = mvg->get_workspace_size(); + + // give the workspace area to mvg + workspace_d.resize(o, stream); + mvg->set_workspace(workspace_d.data()); + + // get gaussians in X_d | P_d is destroyed. + mvg->give_gaussian(nPoints, P_d.data(), X_d.data(), x_d.data()); + + // saving the mean of the randoms in Rand_mean + //@todo can be swapped with a API that calculates mean + RAFT_CUDA_TRY(cudaMemset(Rand_mean.data(), 0, dim * sizeof(T))); + dim3 block = (64); + dim3 grid = (raft::ceildiv(nPoints * dim, (int)block.x)); + En_KF_accumulate<<>>(nPoints, dim, X_d.data(), Rand_mean.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + grid = (raft::ceildiv(dim, (int)block.x)); + En_KF_normalize<<>>(nPoints, dim, Rand_mean.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // storing the error wrt random point mean in X_d + grid = (raft::ceildiv(dim * nPoints, (int)block.x)); + En_KF_dif<<>>(nPoints, dim, X_d.data(), Rand_mean.data(), X_d.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // finding the cov matrix, placing in Rand_cov + T alfa = 1.0 / (nPoints - 1), beta = 0.0; + + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_T, + dim, + dim, + nPoints, + &alfa, + X_d.data(), + dim, + X_d.data(), + dim, + &beta, + Rand_cov.data(), + dim, + stream)); + + // restoring cov provided into P_d + raft::update_device(P_d.data(), P.data(), dim * dim, stream); + } + + void TearDown() override + { + // deleting mvg + delete mvg; + } + + protected: + MVGInputs params; + std::vector P, x, X; + rmm::device_uvector workspace_d, P_d, x_d, X_d, Rand_cov, Rand_mean; + int dim, nPoints; + typename multi_variable_gaussian::Decomposer method; + Correlation corr; + multi_variable_gaussian* mvg = NULL; + T tolerance; + raft::handle_t handle; +}; // end of MVGTest class + +///@todo find out the reason that Un-correlated covs are giving problems (in qr) +// Declare your inputs +const std::vector> inputsf = { + {0.3f, + multi_variable_gaussian::Decomposer::chol_decomp, + Correlation::CORRELATED, + 5, + 30000, + 6ULL}, + {0.1f, + multi_variable_gaussian::Decomposer::chol_decomp, + Correlation::UNCORRELATED, + 5, + 30000, + 6ULL}, + {0.25f, + multi_variable_gaussian::Decomposer::jacobi, + Correlation::CORRELATED, + 5, + 30000, + 6ULL}, + {0.1f, + multi_variable_gaussian::Decomposer::jacobi, + Correlation::UNCORRELATED, + 5, + 30000, + 6ULL}, + {0.2f, multi_variable_gaussian::Decomposer::qr, Correlation::CORRELATED, 5, 
30000, 6ULL}, + // { 0.2f, multi_variable_gaussian::Decomposer::qr, + // Correlation::UNCORRELATED, 5, 30000, 6ULL} +}; +const std::vector> inputsd = { + {0.25, + multi_variable_gaussian::Decomposer::chol_decomp, + Correlation::CORRELATED, + 10, + 3000000, + 6ULL}, + {0.1, + multi_variable_gaussian::Decomposer::chol_decomp, + Correlation::UNCORRELATED, + 10, + 3000000, + 6ULL}, + {0.25, + multi_variable_gaussian::Decomposer::jacobi, + Correlation::CORRELATED, + 10, + 3000000, + 6ULL}, + {0.1, + multi_variable_gaussian::Decomposer::jacobi, + Correlation::UNCORRELATED, + 10, + 3000000, + 6ULL}, + {0.2, + multi_variable_gaussian::Decomposer::qr, + Correlation::CORRELATED, + 10, + 3000000, + 6ULL}, + // { 0.2, multi_variable_gaussian::Decomposer::qr, + // Correlation::UNCORRELATED, 10, 3000000, 6ULL} +}; + +// make the tests +typedef MVGTest MVGTestF; +typedef MVGTest MVGTestD; +TEST_P(MVGTestF, MeanIsCorrectF) +{ + EXPECT_TRUE(raft::devArrMatch( + x_d.data(), Rand_mean.data(), dim, raft::CompareApprox(tolerance), handle.get_stream())) + << " in MeanIsCorrect"; +} +TEST_P(MVGTestF, CovIsCorrectF) +{ + EXPECT_TRUE(raft::devArrMatch(P_d.data(), + Rand_cov.data(), + dim, + dim, + raft::CompareApprox(tolerance), + handle.get_stream())) + << " in CovIsCorrect"; +} +TEST_P(MVGTestD, MeanIsCorrectD) +{ + EXPECT_TRUE(raft::devArrMatch( + x_d.data(), Rand_mean.data(), dim, raft::CompareApprox(tolerance), handle.get_stream())) + << " in MeanIsCorrect"; +} +TEST_P(MVGTestD, CovIsCorrectD) +{ + EXPECT_TRUE(raft::devArrMatch(P_d.data(), + Rand_cov.data(), + dim, + dim, + raft::CompareApprox(tolerance), + handle.get_stream())) + << " in CovIsCorrect"; +} + +// call the tests +INSTANTIATE_TEST_CASE_P(MVGTests, MVGTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(MVGTests, MVGTestD, ::testing::ValuesIn(inputsd)); + +}; // end of namespace raft::random \ No newline at end of file diff --git a/cpp/test/random/permute.cu b/cpp/test/random/permute.cu new file mode 100644 index 0000000000..294444d409 --- /dev/null +++ b/cpp/test/random/permute.cu @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace random { + +template +struct PermInputs { + int N, D; + bool needPerms, needShuffle, rowMajor; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const PermInputs& dims) +{ + return os; +} + +template +class PermTest : public ::testing::TestWithParam> { + protected: + PermTest() : in(0, stream), out(0, stream), outPerms(0, stream) {} + + void SetUp() override + { + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + params = ::testing::TestWithParam>::GetParam(); + // forcefully set needPerms, since we need it for unit-testing! 
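    // [Editor's note] A short sketch of the invariant this test exercises, phrased from the
    // checks defined later in this same file (no new APIs are assumed): permute() is expected
    // to fill outPerms with a permutation of [0, N) and, when in/out buffers are supplied,
    // to reorder the data so that
    //   row-major:    out[i * D + j] == in[outPerms[i] * D + j]
    //   column-major: out[j * N + i] == in[j * N + outPerms[i]]
    // devArrMatchRange() checks the first property by sorting the permutation and comparing
    // it against 0..N-1; devArrMatchShuffle() checks the second property element by element.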
+    if (params.needShuffle) { params.needPerms = true; }
+    raft::random::Rng r(params.seed);
+    int N = params.N;
+    int D = params.D;
+    int len = N * D;
+    if (params.needPerms) {
+      outPerms.resize(N, stream);
+      outPerms_ptr = outPerms.data();
+    }
+    if (params.needShuffle) {
+      in.resize(len, stream);
+      out.resize(len, stream);
+      in_ptr = in.data();
+      out_ptr = out.data();
+      r.uniform(in_ptr, len, T(-1.0), T(1.0), stream);
+    }
+    permute(outPerms_ptr, out_ptr, in_ptr, D, N, params.rowMajor, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+ protected:
+  PermInputs<T> params;
+  rmm::device_uvector<T> in, out;
+  T* in_ptr = nullptr;
+  T* out_ptr = nullptr;
+  rmm::device_uvector<int> outPerms;
+  int* outPerms_ptr = nullptr;
+  cudaStream_t stream = 0;
+};
+
+template <typename T, typename L>
+::testing::AssertionResult devArrMatchRange(
+  const T* actual, size_t size, T start, L eq_compare, bool doSort = true, cudaStream_t stream = 0)
+{
+  std::vector<T> act_h(size);
+  raft::update_host(&(act_h[0]), actual, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  if (doSort) std::sort(act_h.begin(), act_h.end());
+  for (size_t i(0); i < size; ++i) {
+    auto act = act_h[i];
+    auto expected = start + i;
+    if (!eq_compare(expected, act)) {
+      return ::testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << expected << " @" << i;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+::testing::AssertionResult devArrMatchShuffle(const int* perms,
+                                              const T* out,
+                                              const T* in,
+                                              int D,
+                                              int N,
+                                              bool rowMajor,
+                                              L eq_compare,
+                                              cudaStream_t stream = 0)
+{
+  std::vector<int> h_perms(N);
+  raft::update_host(&(h_perms[0]), perms, N, stream);
+  std::vector<T> h_out(N * D), h_in(N * D);
+  raft::update_host(&(h_out[0]), out, N * D, stream);
+  raft::update_host(&(h_in[0]), in, N * D, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < D; ++j) {
+      int outPos = rowMajor ? i * D + j : j * N + i;
+      int inPos = rowMajor ?
h_perms[i] * D + j : j * N + h_perms[i]; + auto act = h_out[outPos]; + auto expected = h_in[inPos]; + if (!eq_compare(expected, act)) { + return ::testing::AssertionFailure() + << "actual=" << act << " != expected=" << expected << " @" << i << ", " << j; + } + } + } + return ::testing::AssertionSuccess(); +} + +const std::vector> inputsf = { + // only generate permutations + {32, 8, true, false, true, 1234ULL}, + {32, 8, true, false, true, 1234567890ULL}, + {1024, 32, true, false, true, 1234ULL}, + {1024, 32, true, false, true, 1234567890ULL}, + {2 * 1024, 32, true, false, true, 1234ULL}, + {2 * 1024, 32, true, false, true, 1234567890ULL}, + {2 * 1024 + 500, 32, true, false, true, 1234ULL}, + {2 * 1024 + 500, 32, true, false, true, 1234567890ULL}, + {100000, 32, true, false, true, 1234ULL}, + {100000, 32, true, false, true, 1234567890ULL}, + {100001, 33, true, false, true, 1234567890ULL}, + // permute and shuffle the data row major + {32, 8, true, true, true, 1234ULL}, + {32, 8, true, true, true, 1234567890ULL}, + {1024, 32, true, true, true, 1234ULL}, + {1024, 32, true, true, true, 1234567890ULL}, + {2 * 1024, 32, true, true, true, 1234ULL}, + {2 * 1024, 32, true, true, true, 1234567890ULL}, + {2 * 1024 + 500, 32, true, true, true, 1234ULL}, + {2 * 1024 + 500, 32, true, true, true, 1234567890ULL}, + {100000, 32, true, true, true, 1234ULL}, + {100000, 32, true, true, true, 1234567890ULL}, + {100001, 31, true, true, true, 1234567890ULL}, + // permute and shuffle the data column major + {32, 8, true, true, false, 1234ULL}, + {32, 8, true, true, false, 1234567890ULL}, + {1024, 32, true, true, false, 1234ULL}, + {1024, 32, true, true, false, 1234567890ULL}, + {2 * 1024, 32, true, true, false, 1234ULL}, + {2 * 1024, 32, true, true, false, 1234567890ULL}, + {2 * 1024 + 500, 32, true, true, false, 1234ULL}, + {2 * 1024 + 500, 32, true, true, false, 1234567890ULL}, + {100000, 32, true, true, false, 1234ULL}, + {100000, 32, true, true, false, 1234567890ULL}, + {100001, 33, true, true, false, 1234567890ULL}}; + +typedef PermTest PermTestF; +TEST_P(PermTestF, Result) +{ + if (params.needPerms) { + ASSERT_TRUE(devArrMatchRange(outPerms_ptr, params.N, 0, raft::Compare())); + } + if (params.needShuffle) { + ASSERT_TRUE(devArrMatchShuffle( + outPerms_ptr, out_ptr, in_ptr, params.D, params.N, params.rowMajor, raft::Compare())); + } +} +INSTANTIATE_TEST_CASE_P(PermTests, PermTestF, ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + // only generate permutations + {32, 8, true, false, true, 1234ULL}, + {32, 8, true, false, true, 1234567890ULL}, + {1024, 32, true, false, true, 1234ULL}, + {1024, 32, true, false, true, 1234567890ULL}, + {2 * 1024, 32, true, false, true, 1234ULL}, + {2 * 1024, 32, true, false, true, 1234567890ULL}, + {2 * 1024 + 500, 32, true, false, true, 1234ULL}, + {2 * 1024 + 500, 32, true, false, true, 1234567890ULL}, + {100000, 32, true, false, true, 1234ULL}, + {100000, 32, true, false, true, 1234567890ULL}, + {100001, 33, true, false, true, 1234567890ULL}, + // permute and shuffle the data row major + {32, 8, true, true, true, 1234ULL}, + {32, 8, true, true, true, 1234567890ULL}, + {1024, 32, true, true, true, 1234ULL}, + {1024, 32, true, true, true, 1234567890ULL}, + {2 * 1024, 32, true, true, true, 1234ULL}, + {2 * 1024, 32, true, true, true, 1234567890ULL}, + {2 * 1024 + 500, 32, true, true, true, 1234ULL}, + {2 * 1024 + 500, 32, true, true, true, 1234567890ULL}, + {100000, 32, true, true, true, 1234ULL}, + {100000, 32, true, true, true, 1234567890ULL}, + {100001, 
31, true, true, true, 1234567890ULL}, + // permute and shuffle the data column major + {32, 8, true, true, false, 1234ULL}, + {32, 8, true, true, false, 1234567890ULL}, + {1024, 32, true, true, false, 1234ULL}, + {1024, 32, true, true, false, 1234567890ULL}, + {2 * 1024, 32, true, true, false, 1234ULL}, + {2 * 1024, 32, true, true, false, 1234567890ULL}, + {2 * 1024 + 500, 32, true, true, false, 1234ULL}, + {2 * 1024 + 500, 32, true, true, false, 1234567890ULL}, + {100000, 32, true, true, false, 1234ULL}, + {100000, 32, true, true, false, 1234567890ULL}, + {100001, 33, true, true, false, 1234567890ULL}}; +typedef PermTest PermTestD; +TEST_P(PermTestD, Result) +{ + if (params.needPerms) { + ASSERT_TRUE(devArrMatchRange(outPerms_ptr, params.N, 0, raft::Compare())); + } + if (params.needShuffle) { + ASSERT_TRUE(devArrMatchShuffle( + outPerms_ptr, out_ptr, in_ptr, params.D, params.N, params.rowMajor, raft::Compare())); + } +} +INSTANTIATE_TEST_CASE_P(PermTests, PermTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace random +} // end namespace raft \ No newline at end of file diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 210eefa850..5a389b8c87 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -96,7 +96,7 @@ TEST_P(COORemoveZeros, Result) raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); op::coo_remove_zeros(&in, &out, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 8d61ca06a8..ac5443d43b 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -73,7 +73,7 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam(verify.data(), result.data(), nnz, raft::Compare())); diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index b550857797..edf7432c49 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -78,7 +78,7 @@ class SparseReduceTest : public ::testing::TestWithParam( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch(
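// [Editor's note] The sparse-test hunks in this patch migrate the legacy CUDA_CHECK macro to
// RAFT_CUDA_TRY around the same cudaStreamSynchronize calls; the filter.cu hunk shows the
// substitution explicitly, and the norm.cu and reduce.cu hunks appear to make the same
// one-line change, leaving the test logic untouched. As a rough, generic sketch of what such
// an error-checking wrapper usually expands to (an illustration only, not RAFT's actual
// definition of RAFT_CUDA_TRY; assumes <cstdio>, <cstdlib>, and the CUDA runtime header):
//
//   #define CHECK_CUDA_SKETCH(call)                              \
//     do {                                                       \
//       cudaError_t err_ = (call);                               \
//       if (err_ != cudaSuccess) {                               \
//         fprintf(stderr, "CUDA error %s at %s:%d\n",            \
//                 cudaGetErrorString(err_), __FILE__, __LINE__); \
//         abort();                                               \
//       }                                                        \
//     } while (0)
//
// A call site would then mirror the ones above, e.g. CHECK_CUDA_SKETCH(cudaStreamSynchronize(stream)).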