diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 9ca3859e18..2406456404 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,18 +16,13 @@
 
 #pragma once
 
-#include <cstdio>
-#include <cstdlib>
 #include <curand_kernel.h>
 #include <raft/common/cub_wrappers.cuh>
 #include <raft/common/scatter.cuh>
 #include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
 #include <random>
 #include <rmm/device_uvector.hpp>
-#include <stdint.h>
-#include <type_traits>
 
 namespace raft {
 namespace random {
@@ -37,10 +32,99 @@ namespace detail {
 enum GeneratorType {
   /** curand-based philox generator */
   GenPhilox = 0,
-  /** LFSR taps generator */
-  GenTaps,
-  /** kiss99 generator (currently the fastest) */
-  GenKiss99
+  /** Permuted Congruential Generator */
+  GenPC
+};
+
+template <typename OutType>
+struct InvariantDistParams {
+  OutType const_val;
+};
+
+template <typename OutType>
+struct UniformDistParams {
+  OutType start;
+  OutType end;
+};
+
+template <typename OutType, typename DiffType>
+struct UniformIntDistParams {
+  OutType start;
+  OutType end;
+  DiffType diff;
+};
+
+template <typename OutType>
+struct NormalDistParams {
+  OutType mu;
+  OutType sigma;
+};
+
+template <typename IntType>
+struct NormalIntDistParams {
+  IntType mu;
+  IntType sigma;
+};
+
+template <typename OutType, typename LenType>
+struct NormalTableDistParams {
+  LenType n_rows;
+  LenType n_cols;
+  const OutType* mu_vec;
+  OutType sigma;
+  const OutType* sigma_vec;
+};
+
+template <typename OutType>
+struct BernoulliDistParams {
+  OutType prob;
+};
+
+template <typename OutType>
+struct ScaledBernoulliDistParams {
+  OutType prob;
+  OutType scale;
+};
+
+template <typename OutType>
+struct GumbelDistParams {
+  OutType mu;
+  OutType beta;
+};
+
+template <typename OutType>
+struct LogNormalDistParams {
+  OutType mu;
+  OutType sigma;
+};
+
+template <typename OutType>
+struct LogisticDistParams {
+  OutType mu;
+  OutType scale;
+};
+
+template <typename OutType>
+struct ExponentialDistParams {
+  OutType lambda;
+};
+
+template <typename OutType>
+struct RayleighDistParams {
+  OutType sigma;
+};
+
+template <typename OutType>
+struct LaplaceDistParams {
+  OutType mu;
+  OutType scale;
+};
+
+// Not really a distro, useful for sample without replacement function
+template <typename WeightsT, typename IdxT>
+struct SamplingParams {
+  IdxT* inIdxPtr;
+  const WeightsT* wts;
 };
 
 template <typename Type>
@@ -55,76 +139,261 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type
   val1 = R * c * sigma1 + mu1;
   val2 = R * s * sigma2 + mu2;
 }
+
 template <typename Type>
 DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
 {
   box_muller_transform<Type>(val1, val2, sigma1, mu1, sigma1, mu1);
 }
 
-/**
- * @brief generator-agnostic way of generating random numbers
- * @tparam GenType the generator object that expose 'next' method
- */
-template <typename GenType>
-struct Generator {
-  DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-    : gen(seed, subsequence, offset)
-  {
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    InvariantDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  *val = params.const_val;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    UniformDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  OutType res;
+  gen.next(res);
+  *val = (res * (params.end - params.start)) + params.start;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    UniformIntDistParams<OutType, uint32_t> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  uint32_t x = 0;
+  uint32_t s = params.diff;
+  gen.next(x);
+  uint64_t m = uint64_t(x) * s;
+  uint32_t l = uint32_t(m);
+  if (l < s) {
+    uint32_t t = (-s) % s;  // (2^32 - s) mod s
+    while (l < t) {
+      gen.next(x);
+      m = uint64_t(x) * s;
+      l = uint32_t(m);
+    }
   }
+  *val = OutType(m >> 32) + params.start;
+}
 
-  template <typename Type>
-  DI void next(Type& ret)
-  {
-    gen.next(ret);
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    UniformIntDistParams<OutType, uint64_t> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  uint64_t x = 0;
+  gen.next(x);
+  uint64_t s = params.diff;
+  uint64_t m_lo, m_hi;
+  // m = x * s;
+  asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s));
+  asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s));
+  if (m_lo < s) {
+    uint64_t t = (-s) % s;  // (2^64 - s) mod s
+    while (m_lo < t) {
+      gen.next(x);
+      asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s));
+      asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s));
+    }
   }
+  *val = OutType(m_hi) + params.start;
+}
 
- private:
-  /** the actual generator */
-  GenType gen;
-};
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(
+  GenType& gen, OutType* val, NormalDistParams<OutType> params, LenType idx = 0, LenType stride = 0)
+{
+  OutType res1, res2;
+  gen.next(res1);
+  gen.next(res2);
+  box_muller_transform<OutType>(res1, res2, params.sigma, params.mu);
+  *val       = res1;
+  *(val + 1) = res2;
+}
 
-template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda>
-__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp)
+template <typename GenType, typename IntType, typename LenType>
+DI void custom_next(GenType& gen,
+                    IntType* val,
+                    NormalIntDistParams<IntType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
 {
-  LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  Generator<GenType> gen(seed, (uint64_t)tid, offset);
-  const LenType stride = gridDim.x * blockDim.x;
-  for (LenType idx = tid; idx < len; idx += stride) {
-    MathType val;
-    gen.next(val);
-    ptr[idx] = randOp(val, idx);
-  }
+  IntType res1_int, res2_int;
+  gen.next(res1_int);
+  gen.next(res2_int);
+  double res1  = static_cast<double>(res1_int);
+  double res2  = static_cast<double>(res2_int);
+  double mu    = static_cast<double>(params.mu);
+  double sigma = static_cast<double>(params.sigma);
+  box_muller_transform<double>(res1, res2, sigma, mu);
+  *val       = static_cast<IntType>(res1);
+  *(val + 1) = static_cast<IntType>(res2);
 }
 
-// used for Box-Muller type transformations
-template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda2>
-__global__ void rand2Kernel(
-  uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op)
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    NormalTableDistParams<OutType, LenType> params,
+                    LenType idx,
+                    LenType stride)
 {
-  LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  Generator<GenType> gen(seed, (uint64_t)tid, offset);
-  const LenType stride = gridDim.x * blockDim.x;
-  for (LenType idx = tid; idx < len; idx += stride) {
-    MathType val1, val2;
-    gen.next(val1);
-    gen.next(val2);
-    rand2Op(val1, val2, idx, idx + stride);
-    if (idx < len) ptr[idx] = (OutType)val1;
-    idx += stride;
-    if (idx < len) ptr[idx] = (OutType)val2;
-  }
+  OutType res1, res2;
+  gen.next(res1);
+  gen.next(res2);
+  LenType col1  = idx % params.n_cols;
+  LenType col2  = (idx + stride) % params.n_cols;
+  OutType mean1 = params.mu_vec[col1];
+  OutType mean2 = params.mu_vec[col2];
+  OutType sig1  = params.sigma_vec == nullptr ? params.sigma : params.sigma_vec[col1];
+  OutType sig2  = params.sigma_vec == nullptr ? params.sigma : params.sigma_vec[col2];
+  box_muller_transform<OutType>(res1, res2, sig1, mean1, sig2, mean2);
+  *val       = res1;
+  *(val + 1) = res2;
 }
 
-template <typename Type>
-__global__ void constFillKernel(Type* ptr, int len, Type val)
+template <typename GenType, typename OutType, typename Type, typename LenType>
+DI void custom_next(
+  GenType& gen, OutType* val, BernoulliDistParams<Type> params, LenType idx = 0, LenType stride = 0)
+{
+  Type res = 0;
+  gen.next(res);
+  *val = res > params.prob;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    ScaledBernoulliDistParams<OutType> params,
+                    LenType idx,
+                    LenType stride)
+{
+  OutType res = 0;
+  gen.next(res);
+  *val = res > params.prob ? -params.scale : params.scale;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(
+  GenType& gen, OutType* val, GumbelDistParams<OutType> params, LenType idx = 0, LenType stride = 0)
+{
+  OutType res = 0;
+  gen.next(res);
+  *val = params.mu - params.beta * raft::myLog(-raft::myLog(res));
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    LogNormalDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
 {
-  unsigned tid          = (blockIdx.x * blockDim.x) + threadIdx.x;
-  const unsigned stride = gridDim.x * blockDim.x;
-  for (unsigned idx = tid; idx < len; idx += stride) {
-    ptr[idx] = val;
+  OutType res1 = 0, res2 = 0;
+  gen.next(res1);
+  gen.next(res2);
+  box_muller_transform<OutType>(res1, res2, params.sigma, params.mu);
+  *val       = raft::myExp(res1);
+  *(val + 1) = raft::myExp(res2);
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    LogisticDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  OutType res;
+  gen.next(res);
+  constexpr OutType one = (OutType)1.0;
+  *val                  = params.mu - params.scale * raft::myLog(one / res - one);
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    ExponentialDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  OutType res;
+  gen.next(res);
+  constexpr OutType one = (OutType)1.0;
+  *val                  = -raft::myLog(one - res) / params.lambda;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    RayleighDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  OutType res;
+  gen.next(res);
+  constexpr OutType one = (OutType)1.0;
+  constexpr OutType two = (OutType)2.0;
+  *val                  = raft::mySqrt(-two * raft::myLog(one - res)) * params.sigma;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(GenType& gen,
+                    OutType* val,
+                    LaplaceDistParams<OutType> params,
+                    LenType idx    = 0,
+                    LenType stride = 0)
+{
+  OutType res, out;
+  gen.next(res);
+  constexpr OutType one     = (OutType)1.0;
+  constexpr OutType two     = (OutType)2.0;
+  constexpr OutType oneHalf = (OutType)0.5;
+  if (res <= oneHalf) {
+    out = params.mu + params.scale * raft::myLog(two * res);
+  } else {
+    out = params.mu - params.scale * raft::myLog(two * (one - res));
+  }
+  *val = out;
+}
+
+template <typename GenType, typename OutType, typename LenType>
+DI void custom_next(
+  GenType& gen, OutType* val, SamplingParams<OutType, LenType> params, LenType idx, LenType stride)
+{
+  OutType res;
+  gen.next(res);
+  params.inIdxPtr[idx]  = idx;
+  constexpr OutType one = (OutType)1.0;
+  auto exp              = -raft::myLog(one - res);
+  if (params.wts != nullptr) {
+    *val = exp / params.wts[idx];
+  } else {
+    *val = exp;
   }
 }
 
+struct RngState {
+  uint64_t seed;
+  uint64_t base_subsequence;
+};
+
 /** Philox-based random number generator */
 // Courtesy: Jakub Szuppe
 struct PhiloxGenerator {
@@ -136,484 +405,416 @@ struct PhiloxGenerator {
    */
   DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
   {
-    curand_init(seed, subsequence, offset, &state);
+    curand_init(seed, subsequence, offset, &philox_state);
   }
 
-  /**
-   * @defgroup NextRand Generate the next random number
-   * @{
-   */
-  DI void next(float& ret) { ret = curand_uniform(&(this->state)); }
-  DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); }
-  DI void next(uint32_t& ret) { ret = curand(&(this->state)); }
-  DI void next(uint64_t& ret)
-  {
-    uint32_t a, b;
-    next(a);
-    next(b);
-    ret = (uint64_t)a | ((uint64_t)b << 32);
-  }
-  DI void next(int32_t& ret)
-  {
-    uint32_t val;
-    next(val);
-    ret = int32_t(val & 0x7fffffff);
-  }
-  DI void next(int64_t& ret)
-  {
-    uint64_t val;
-    next(val);
-    ret = int64_t(val & 0x7fffffffffffffff);
-  }
-  /** @} */
-
- private:
-  /** the state for RNG */
-  curandStatePhilox4_32_10_t state;
-};
-
-/** LFSR taps-filter for generating random numbers. */
-// Courtesy: Vinay Deshpande
-struct TapsGenerator {
-  /**
-   * @brief ctor. Initializes the state for RNG
-   * @param seed the seed (can be same across all threads)
-   * @param subsequence unused
-   * @param offset unused
-   */
-  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  DI PhiloxGenerator(const RngState& rng_state, const uint64_t subsequence)
   {
-    uint64_t delta  = (blockIdx.x * blockDim.x) + threadIdx.x;
-    uint64_t stride = blockDim.x * gridDim.x;
-    delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride;
-    stride *= blockDim.y * gridDim.y;
-    delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride;
-    state = seed + delta + 1;
+    curand_init(rng_state.seed, rng_state.base_subsequence + subsequence, 0, &philox_state);
   }
 
   /**
    * @defgroup NextRand Generate the next random number
    * @{
    */
-  template <typename Type>
-  DI void next(Type& ret)
-  {
-    constexpr double ULL_LARGE = 1.8446744073709551614e19;
-    uint64_t val;
-    next(val);
-    ret = static_cast<Type>(val);
-    ret /= static_cast<Type>(ULL_LARGE);
-  }
-  DI void next(uint64_t& ret)
+  DI uint32_t next_u32()
   {
-    constexpr uint64_t TAPS = 0x8000100040002000ULL;
-    constexpr int ROUNDS    = 128;
-    for (int i = 0; i < ROUNDS; i++)
-      state = (state >> 1) ^ (-(state & 1ULL) & TAPS);
-    ret = state;
+    uint32_t ret = curand(&(this->philox_state));
+    return ret;
   }
-  DI void next(uint32_t& ret)
+
+  DI uint64_t next_u64()
   {
-    uint64_t val;
-    next(val);
-    ret = (uint32_t)val;
+    uint64_t ret;
+    uint32_t a, b;
+    a   = next_u32();
+    b   = next_u32();
+    ret = uint64_t(a) | (uint64_t(b) << 32);
+    return ret;
   }
-  DI void next(int32_t& ret)
+
+  DI int32_t next_i32()
   {
+    int32_t ret;
     uint32_t val;
-    next(val);
+    val = next_u32();
     ret = int32_t(val & 0x7fffffff);
+    return ret;
   }
-  DI void next(int64_t& ret)
+
+  DI int64_t next_i64()
   {
+    int64_t ret;
     uint64_t val;
-    next(val);
+    val = next_u64();
     ret = int64_t(val & 0x7fffffffffffffff);
+    return ret;
   }
+
+  DI void next(float& ret) { ret = curand_uniform(&(this->philox_state)); }
+  DI void next(double& ret) { ret = curand_uniform_double(&(this->philox_state)); }
+
+  DI void next(uint32_t& ret) { ret = next_u32(); }
+  DI void next(uint64_t& ret) { ret = next_u64(); }
+  DI void next(int32_t& ret) { ret = next_i32(); }
+  DI void next(int64_t& ret) { ret = next_i64(); }
+
   /** @} */
 
  private:
   /** the state for RNG */
-  uint64_t state;
+  curandStatePhilox4_32_10_t philox_state;
 };
 
-/** Kiss99-based random number generator */
+/** PCG random number generator */
 
-struct Kiss99Generator {
+struct PCGenerator {
   /**
-   * @brief ctor. Initializes the state for RNG
-   * @param seed the seed (can be same across all threads)
-   * @param subsequence unused
+   * @brief ctor. Initializes the state for RNG. This code is derived from PCG basic code
+   * @param seed the seed (can be same across all threads). Same as PCG's initstate
+   * @param subsequence is same as PCG's initseq
    * @param offset unused
    */
-  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); }
+  DI PCGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
+    pcg_state = uint64_t(0);
+    inc       = (subsequence << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    pcg_state += seed;
+    next(discard);
+    skipahead(offset);
+  }
+
+  DI PCGenerator(const RngState& rng_state, const uint64_t subsequence)
+  {
+    pcg_state = uint64_t(0);
+    inc       = ((rng_state.base_subsequence + subsequence) << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    pcg_state += rng_state.seed;
+    next(discard);
+  }
+
+  // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
+  // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
+  DI void skipahead(uint64_t offset)
+  {
+    uint64_t G = 1;
+    uint64_t h = 6364136223846793005ULL;
+    uint64_t C = 0;
+    uint64_t f = inc;
+    while (offset) {
+      if (offset & 1) {
+        G = G * h;
+        C = C * h + f;
+      }
+      f = f * (h + 1);
+      h = h * h;
+      offset >>= 1;
+    }
+    pcg_state = pcg_state * G + C;
+  }
 
   /**
    * @defgroup NextRand Generate the next random number
+   * @brief This code is derived from PCG basic code
    * @{
    */
-  template <typename Type>
-  DI void next(Type& ret)
+  DI uint32_t next_u32()
   {
-    constexpr double U_LARGE = 4.294967295e9;
-    uint32_t val;
-    next(val);
-    ret = static_cast<Type>(val);
-    ret /= static_cast<Type>(U_LARGE);
+    uint32_t ret;
+    uint64_t oldstate   = pcg_state;
+    pcg_state           = oldstate * 6364136223846793005ULL + inc;
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot        = oldstate >> 59u;
+    ret                 = (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+    return ret;
   }
-  DI void next(uint32_t& ret)
-  {
-    uint32_t MWC;
-    z   = 36969 * (z & 65535) + (z >> 16);
-    w   = 18000 * (w & 65535) + (w >> 16);
-    MWC = ((z << 16) + w);
-    jsr ^= (jsr << 17);
-    jsr ^= (jsr >> 13);
-    jsr ^= (jsr << 5);
-    jcong = 69069 * jcong + 1234567;
-    MWC   = ((MWC ^ jcong) + jsr);
-    ret   = MWC;
-  }
-  DI void next(uint64_t& ret)
+  DI uint64_t next_u64()
   {
+    uint64_t ret;
     uint32_t a, b;
-    next(a);
-    next(b);
-    ret = (uint64_t)a | ((uint64_t)b << 32);
+    a   = next_u32();
+    b   = next_u32();
+    ret = uint64_t(a) | (uint64_t(b) << 32);
+    return ret;
   }
-  DI void next(int32_t& ret)
+
+  DI int32_t next_i32()
   {
+    int32_t ret;
     uint32_t val;
-    next(val);
+    val = next_u32();
     ret = int32_t(val & 0x7fffffff);
+    return ret;
   }
-  DI void next(int64_t& ret)
+
+  DI int64_t next_i64()
   {
+    int64_t ret;
     uint64_t val;
-    next(val);
+    val = next_u64();
     ret = int64_t(val & 0x7fffffffffffffff);
+    return ret;
   }
-  /** @} */
 
- private:
-  /** one of the kiss99 states */
-  uint32_t z;
-  /** one of the kiss99 states */
-  uint32_t w;
-  /** one of the kiss99 states */
-  uint32_t jsr;
-  /** one of the kiss99 states */
-  uint32_t jcong;
-
-  // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower
-  // 128 bits. It uses 32-bit wide multiply only.
-  DI void mulByFnv1a128Prime(uint32_t* h)
-  {
-    typedef union {
-      uint32_t u32[2];
-      uint64_t u64[1];
-    } words64;
-
-    // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0
-    // Here p0 = 315, p2 = 16777216, p1 = p3 = 0
-    const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216);
-    // Partial products
-    words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2;
-
-    h0p0.u64[0] = uint64_t(h[0]) * p0;
-    h1p0.u64[0] = uint64_t(h[1]) * p0;
-    h2p0.u64[0] = uint64_t(h[2]) * p0;
-    h0p2.u64[0] = uint64_t(h[0]) * p2;
-    h3p0.u64[0] = uint64_t(h[3]) * p0;
-    h1p2.u64[0] = uint64_t(h[1]) * p2;
-
-    // h_n[0] = LO(h[0]*p[0]);
-    // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]);
-    // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]);
-    // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]);
-    uint32_t carry = 0;
-    h[0]           = h0p0.u32[0];
-
-    h[1]  = h0p0.u32[1] + h1p0.u32[0];
-    carry = h[1] < h0p0.u32[1] ? 1 : 0;
-
-    h[2]  = h1p0.u32[1] + carry;
-    carry = h[2] < h1p0.u32[1] ? 1 : 0;
-    h[2] += h2p0.u32[0];
-    carry = h[2] < h2p0.u32[0] ? carry + 1 : carry;
-    h[2] += h0p2.u32[0];
-    carry = h[2] < h0p2.u32[0] ? carry + 1 : carry;
-
-    h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry;
-    return;
+  DI float next_float()
+  {
+    float ret;
+    uint32_t val = next_u32() >> 8;
+    ret          = static_cast<float>(val) / (1U << 24);
+    return ret;
   }
 
-  DI void fnv1a128(uint32_t* hash, uint32_t txt)
+  DI double next_double()
   {
-    hash[0] ^= (txt >> 0) & 0xFF;
-    mulByFnv1a128Prime(hash);
-    hash[0] ^= (txt >> 8) & 0xFF;
-    mulByFnv1a128Prime(hash);
-    hash[0] ^= (txt >> 16) & 0xFF;
-    mulByFnv1a128Prime(hash);
-    hash[0] ^= (txt >> 24) & 0xFF;
-    mulByFnv1a128Prime(hash);
+    double ret;
+    uint64_t val = next_u64() >> 11;
+    ret          = static_cast<double>(val) / (1LU << 53);
+    return ret;
   }
 
-  DI void initKiss99(uint64_t seed)
-  {
-    // Initialize hash to 128-bit FNV1a basis
-    uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL};
+  DI void next(uint32_t& ret) { ret = next_u32(); }
+  DI void next(uint64_t& ret) { ret = next_u64(); }
+  DI void next(int32_t& ret) { ret = next_i32(); }
+  DI void next(int64_t& ret) { ret = next_i64(); }
 
-    // Digest threadIdx, blockIdx and seed
-    fnv1a128(hash, threadIdx.x);
-    fnv1a128(hash, threadIdx.y);
-    fnv1a128(hash, threadIdx.z);
-    fnv1a128(hash, blockIdx.x);
-    fnv1a128(hash, blockIdx.y);
-    fnv1a128(hash, blockIdx.z);
-    fnv1a128(hash, uint32_t(seed));
-    fnv1a128(hash, uint32_t(seed >> 32));
+  DI void next(float& ret) { ret = next_float(); }
+  DI void next(double& ret) { ret = next_double(); }
 
-    // Initialize KISS99 state with hash
-    z     = hash[0];
-    w     = hash[1];
-    jsr   = hash[2];
-    jcong = hash[3];
-  }
+  /** @} */
+
+ private:
+  uint64_t pcg_state;
+  uint64_t inc;
 };
 
-/** The main random number generator class, fully on GPUs */
+template <typename OutType,
+          typename LenType,
+          typename GenType,
+          int ITEMS_PER_CALL,
+          typename ParamType>
+__global__ void fillKernel(
+  uint64_t seed, uint64_t adv_subs, uint64_t offset, OutType* ptr, LenType len, ParamType params)
+{
+  LenType tid = threadIdx.x + blockIdx.x * blockDim.x;
+  GenType gen(seed, adv_subs + (uint64_t)tid, offset);
+  const LenType stride = gridDim.x * blockDim.x;
+  for (LenType idx = tid; idx < len; idx += stride * ITEMS_PER_CALL) {
+    OutType val[ITEMS_PER_CALL];
+    custom_next(gen, val, params, idx, stride);
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_CALL; i++) {
+      if ((idx + i * stride) < len) ptr[idx + i * stride] = val[i];
+    }
+  }
+  return;
+}
+
 class RngImpl {
  public:
-  RngImpl(uint64_t _s, GeneratorType _t = GenPhilox)
-    : type(_t),
-      offset(0),
+  RngImpl(uint64_t seed, GeneratorType _t = GenPhilox)
+    : state{seed, 0},
+      type(_t),
       // simple heuristic to make sure all SMs will be occupied properly
       // and also not too many initialization calls will be made by each thread
-      nBlocks(4 * getMultiProcessorCount()),
-      gen()
+      nBlocks(4 * getMultiProcessorCount())
   {
-    seed(_s);
-  }
-
-  void seed(uint64_t _s)
-  {
-    gen.seed(_s);
-    offset = 0;
   }
 
   template <typename IdxT>
   void affine_transform_params(IdxT n, IdxT& a, IdxT& b)
   {
     // always keep 'a' to be coprime to 'n'
-    a = gen() % n;
+    std::mt19937_64 mt_rng(state.seed + state.base_subsequence);
+    a = mt_rng() % n;
     while (gcd(a, n) != 1) {
       ++a;
       if (a >= n) a = 0;
     }
     // the bias term 'b' can be any number in the range of [0, n)
-    b = gen() % n;
+    b = mt_rng() % n;
   }
 
-  template <typename Type, typename LenType = int>
-  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void uniform(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream)
   {
-    static_assert(std::is_floating_point<Type>::value,
+    static_assert(std::is_floating_point<OutType>::value,
                   "Type for 'uniform' can only be floating point!");
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; },
-      stream);
-  }
-  template <typename IntType, typename LenType = int>
-  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
-  {
-    static_assert(std::is_integral<IntType>::value, "Type for 'uniformInt' can only be integer!");
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; },
-      stream);
+    UniformDistParams<OutType> params;
+    params.start = start;
+    params.end   = end;
+    kernel_dispatch<OutType, LenType, 1, UniformDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void uniformInt(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream)
+  {
+    static_assert(std::is_integral<OutType>::value, "Type for 'uniformInt' can only be integer!");
+    ASSERT(end > start, "'end' must be greater than 'start'");
+    if (sizeof(OutType) == 4) {
+      UniformIntDistParams<OutType, uint32_t> params;
+      params.start = start;
+      params.end   = end;
+      params.diff  = uint32_t(params.end - params.start);
+      kernel_dispatch<OutType, LenType, 1, UniformIntDistParams<OutType, uint32_t>>(
+        ptr, len, stream, params);
+    } else {
+      UniformIntDistParams<OutType, uint64_t> params;
+      params.start = start;
+      params.end   = end;
+      params.diff  = uint64_t(params.end - params.start);
+      kernel_dispatch<OutType, LenType, 1, UniformIntDistParams<OutType, uint64_t>>(
+        ptr, len, stream, params);
+    }
   }
 
-  template <typename Type, typename LenType = int>
-  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void normal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream)
   {
-    static_assert(std::is_floating_point<Type>::value,
+    static_assert(std::is_floating_point<OutType>::value,
                   "Type for 'normal' can only be floating point!");
-    rand2Impl(
-      offset,
-      ptr,
-      len,
-      [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
-        box_muller_transform<Type>(val1, val2, sigma, mu);
-      },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
+    NormalDistParams<OutType> params;
+    params.mu    = mu;
+    params.sigma = sigma;
+    kernel_dispatch<OutType, LenType, 2, NormalDistParams<OutType>>(ptr, len, stream, params);
   }
+
   template <typename IntType, typename LenType = int>
   void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
   {
     static_assert(std::is_integral<IntType>::value, "Type for 'normalInt' can only be integer!");
-    rand2Impl<IntType, double>(
-      offset,
-      ptr,
-      len,
-      [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) {
-        box_muller_transform<double>(val1, val2, sigma, mu);
-      },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void normalTable(Type* ptr,
+    NormalIntDistParams<IntType> params;
+    params.mu    = mu;
+    params.sigma = sigma;
+    kernel_dispatch<IntType, LenType, 2, NormalIntDistParams<IntType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void normalTable(OutType* ptr,
                    LenType n_rows,
                    LenType n_cols,
-                   const Type* mu,
-                   const Type* sigma_vec,
-                   Type sigma,
+                   const OutType* mu_vec,
+                   const OutType* sigma_vec,
+                   OutType sigma,
                    cudaStream_t stream)
   {
-    rand2Impl(
-      offset,
-      ptr,
-      n_rows * n_cols,
-      [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
-        // yikes! use fast-int-div
-        auto col1  = idx1 % n_cols;
-        auto col2  = idx2 % n_cols;
-        auto mean1 = mu[col1];
-        auto mean2 = mu[col2];
-        auto sig1  = sigma_vec == nullptr ? sigma : sigma_vec[col1];
-        auto sig2  = sigma_vec == nullptr ? sigma : sigma_vec[col2];
-        box_muller_transform<Type>(val1, val2, sig1, mean1, sig2, mean2);
-      },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
-  {
-    detail::constFillKernel<Type><<<nBlocks, NumThreads, 0, stream>>>(ptr, len, val);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    NormalTableDistParams<OutType, LenType> params;
+    params.n_rows    = n_rows;
+    params.n_cols    = n_cols;
+    params.mu_vec    = mu_vec;
+    params.sigma     = sigma;
+    params.sigma_vec = sigma_vec;
+    LenType len      = n_rows * n_cols;
+    kernel_dispatch<OutType, LenType, 2, NormalTableDistParams<OutType, LenType>>(
+      ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void fill(OutType* ptr, LenType len, OutType val, cudaStream_t stream)
+  {
+    InvariantDistParams<OutType> params;
+    params.const_val = val;
+    kernel_dispatch<OutType, LenType, 1, InvariantDistParams<OutType>>(ptr, len, stream, params);
   }
 
   template <typename Type, typename OutType = bool, typename LenType = int>
   void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream)
   {
-    custom_distribution<OutType, Type>(
-      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream);
+    BernoulliDistParams<Type> params;
+    params.prob = prob;
+    kernel_dispatch<OutType, LenType, 1, BernoulliDistParams<Type>>(ptr, len, stream, params);
   }
 
-  template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void scaled_bernoulli(OutType* ptr, LenType len, OutType prob, OutType scale, cudaStream_t stream)
   {
-    static_assert(std::is_floating_point<Type>::value,
+    static_assert(std::is_floating_point<OutType>::value,
                   "Type for 'scaled_bernoulli' can only be floating point!");
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; },
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
-  {
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); },
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
-  {
-    rand2Impl(
-      offset,
-      ptr,
-      len,
-      [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
-        box_muller_transform<Type>(val1, val2, sigma, mu);
-        val1 = raft::myExp(val1);
-        val2 = raft::myExp(val2);
-      },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
-  {
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) {
-        constexpr Type one = (Type)1.0;
-        return mu - scale * raft::myLog(one / val - one);
-      },
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
-  {
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) {
-        constexpr Type one = (Type)1.0;
-        return -raft::myLog(one - val) / lambda;
-      },
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
-  {
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) {
-        constexpr Type one = (Type)1.0;
-        constexpr Type two = (Type)2.0;
-        return raft::mySqrt(-two * raft::myLog(one - val)) * sigma;
-      },
-      stream);
-  }
-
-  template <typename Type, typename LenType = int>
-  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
-  {
-    custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) {
-        constexpr Type one     = (Type)1.0;
-        constexpr Type two     = (Type)2.0;
-        constexpr Type oneHalf = (Type)0.5;
-        Type out;
-        if (val <= oneHalf) {
-          out = mu + scale * raft::myLog(two * val);
-        } else {
-          out = mu - scale * raft::myLog(two * (one - val));
-        }
-        return out;
-      },
-      stream);
+    ScaledBernoulliDistParams<OutType> params;
+    params.prob  = prob;
+    params.scale = scale;
+    kernel_dispatch<OutType, LenType, 1, ScaledBernoulliDistParams<OutType>>(
+      ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void gumbel(OutType* ptr, LenType len, OutType mu, OutType beta, cudaStream_t stream)
+  {
+    GumbelDistParams<OutType> params;
+    params.mu   = mu;
+    params.beta = beta;
+    kernel_dispatch<OutType, LenType, 1, GumbelDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void lognormal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream)
+  {
+    LogNormalDistParams<OutType> params;
+    params.mu    = mu;
+    params.sigma = sigma;
+    kernel_dispatch<OutType, LenType, 2, LogNormalDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void logistic(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream)
+  {
+    LogisticDistParams<OutType> params;
+    params.mu    = mu;
+    params.scale = scale;
+    kernel_dispatch<OutType, LenType, 1, LogisticDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void exponential(OutType* ptr, LenType len, OutType lambda, cudaStream_t stream)
+  {
+    ExponentialDistParams<OutType> params;
+    params.lambda = lambda;
+    kernel_dispatch<OutType, LenType, 1, ExponentialDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void rayleigh(OutType* ptr, LenType len, OutType sigma, cudaStream_t stream)
+  {
+    RayleighDistParams<OutType> params;
+    params.sigma = sigma;
+    kernel_dispatch<OutType, LenType, 1, RayleighDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  template <typename OutType, typename LenType = int>
+  void laplace(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream)
+  {
+    LaplaceDistParams<OutType> params;
+    params.mu    = mu;
+    params.scale = scale;
+    kernel_dispatch<OutType, LenType, 1, LaplaceDistParams<OutType>>(ptr, len, stream, params);
+  }
+
+  void advance(uint64_t max_uniq_subsequences_used,
+               uint64_t max_numbers_generated_per_subsequence = 0)
+  {
+    state.base_subsequence += max_uniq_subsequences_used;
+  }
+
+  template <typename OutType, typename LenType, int ITEMS_PER_CALL, typename ParamType>
+  void kernel_dispatch(OutType* ptr, LenType len, cudaStream_t stream, ParamType params)
+  {
+    switch (type) {
+      case GenPhilox:
+        fillKernel<OutType, LenType, PhiloxGenerator, ITEMS_PER_CALL>
+          <<<nBlocks, nThreads, 0, stream>>>(
+            state.seed, state.base_subsequence, 0, ptr, len, params);
+        break;
+      case GenPC:
+        fillKernel<OutType, LenType, PCGenerator, ITEMS_PER_CALL><<<nBlocks, nThreads, 0, stream>>>(
+          state.seed, state.base_subsequence, 0, ptr, len, params);
+        break;
+      default: break;
+    }
+    // The max_numbers_generated_per_subsequence parameter does not matter for now, using 16 for now
+    advance(uint64_t(nBlocks) * nThreads, 16);
+    return;
   }
 
   template <typename DataT, typename WeightsT, typename IdxT = int>
@@ -634,141 +835,28 @@ class RngImpl {
     rmm::device_uvector<IdxT> outIdxBuff(len, stream);
     auto* inIdxPtr = inIdx.data();
     // generate modified weights
-    custom_distribution(
-      expWts.data(),
-      len,
-      [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) {
-        inIdxPtr[idx]          = idx;
-        constexpr WeightsT one = (WeightsT)1.0;
-        auto exp               = -raft::myLog(one - val);
-        if (wts != nullptr) { return exp / wts[idx]; }
-        return exp;
-      },
-      stream);
+    SamplingParams<WeightsT, IdxT> params;
+    params.inIdxPtr = inIdxPtr;
+    params.wts      = wts;
+    kernel_dispatch<WeightsT, IdxT, 1, SamplingParams<WeightsT, IdxT>>(
+      expWts.data(), len, stream, params);
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
     IdxT* outIdxPtr = outIdxBuff.data();
     rmm::device_uvector<char> workspace(0, stream);
     sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
     if (outIdx != nullptr) {
-      RAFT_CUDA_TRY(cudaMemcpyAsync(
+      CUDA_CHECK(cudaMemcpyAsync(
         outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
     }
-    raft::scatter<DataT, IdxT>(out, in, outIdxPtr, sampledLen, stream);
+    scatter<DataT, IdxT>(out, in, outIdxPtr, sampledLen, stream);
   }
 
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
-    randImpl<OutType, MathType, LenType, Lambda>(
-      offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
-  }
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
-    rand2Impl<OutType, MathType, LenType, Lambda>(
-      offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
-  }
-  /** @} */
-
- private:
-  /** generator type */
   GeneratorType type;
-  /**
-   * offset is also used to initialize curand state.
-   * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64),
-   * but is still a large period.
-   */
-  uint64_t offset;
+  RngState state;
   /** number of blocks to launch */
   int nBlocks;
-  /** next seed generator for device-side RNG */
-  std::mt19937_64 gen;
-
-  static const int NumThreads = 256;
-
-  template <bool IsNormal, typename Type, typename LenType>
-  uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks)
-  {
-    LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads));
-    if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; }
-    // curand uses 2 32b uint's to generate one double
-    uint64_t factor = sizeof(Type) / sizeof(float);
-    if (factor == 0) ++factor;
-    // Check if there are enough random numbers left in sequence
-    // If not, then generate new seed and start from zero offset
-    uint64_t newOffset = offset + LenType(itemsPerThread) * factor;
-    if (newOffset < offset) {
-      offset    = 0;
-      seed      = gen();
-      newOffset = itemsPerThread * factor;
-    }
-    return newOffset;
-  }
-
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void randImpl(uint64_t& offset,
-                OutType* ptr,
-                LenType len,
-                Lambda randOp,
-                int nThreads,
-                int nBlocks,
-                GeneratorType type,
-                cudaStream_t stream)
-  {
-    if (len <= 0) return;
-    uint64_t seed  = gen();
-    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
-    switch (type) {
-      case GenPhilox:
-        detail::randKernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
-        break;
-      case GenTaps:
-        detail::randKernel<OutType, MathType, detail::TapsGenerator, LenType, Lambda>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
-        break;
-      case GenKiss99:
-        detail::randKernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
-        break;
-      default: ASSERT(false, "randImpl: Incorrect generator type! %d", type);
-    };
-    RAFT_CUDA_TRY(cudaGetLastError());
-    offset = newOffset;
-  }
-
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda2>
-  void rand2Impl(uint64_t& offset,
-                 OutType* ptr,
-                 LenType len,
-                 Lambda2 rand2Op,
-                 int nThreads,
-                 int nBlocks,
-                 GeneratorType type,
-                 cudaStream_t stream)
-  {
-    if (len <= 0) return;
-    auto seed      = gen();
-    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
-    switch (type) {
-      case GenPhilox:
-        detail::rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda2>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
-        break;
-      case GenTaps:
-        detail::rand2Kernel<OutType, MathType, detail::TapsGenerator, LenType, Lambda2>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
-        break;
-      case GenKiss99:
-        detail::rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda2>
-          <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
-        break;
-      default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
-    };
-    RAFT_CUDA_TRY(cudaGetLastError());
-    offset = newOffset;
-  }
+  static const int nThreads = 256;
 };
 
 };  // end namespace detail
diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp
index 4ec25e71a2..2b1bdbccf7 100644
--- a/cpp/include/raft/random/rng.hpp
+++ b/cpp/include/raft/random/rng.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,21 +21,33 @@
 namespace raft {
 namespace random {
 
-/** all different generator types used */
+using detail::RngState;
+
 using detail::GeneratorType;
-/** curand-based philox generator */
+using detail::GenPC;
 using detail::GenPhilox;
-/** GenTaps : LFSR taps generator */
-using detail::GenTaps;
-/** GenKiss99 : kiss99 generator (currently the fastest) */
-using detail::GenKiss99;
 
-/** Philox-based random number generator */
+using detail::PCGenerator;
 using detail::PhiloxGenerator;
-/** LFSR taps-filter for generating random numbers. */
-using detail::TapsGenerator;
-/** Kiss99-based random number generator */
-using detail::Kiss99Generator;
+
+using detail::BernoulliDistParams;
+using detail::ExponentialDistParams;
+using detail::GumbelDistParams;
+using detail::InvariantDistParams;
+using detail::LaplaceDistParams;
+using detail::LogisticDistParams;
+using detail::LogNormalDistParams;
+using detail::NormalDistParams;
+using detail::NormalIntDistParams;
+using detail::NormalTableDistParams;
+using detail::RayleighDistParams;
+using detail::SamplingParams;
+using detail::ScaledBernoulliDistParams;
+using detail::UniformDistParams;
+using detail::UniformIntDistParams;
+
+// Not strictly needed due to C++ ADL rules
+using detail::custom_next;
 
 /**
  * @brief Helper method to compute Box Muller transform
@@ -53,8 +65,9 @@ using detail::Kiss99Generator;
 template <typename Type>
 DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
 {
-  detail::box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu2);
+  detail::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2);
 }
+
 template <typename Type>
 DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
 {
@@ -62,7 +75,6 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
 }
 /** @} */
 
-/** The main random number generator class, fully on GPUs */
 class Rng : public detail::RngImpl {
  public:
   /**
@@ -71,16 +83,7 @@ class Rng : public detail::RngImpl {
    * @param _t backend device RNG generator type
    * @note Refer to the `Rng::seed` method for details about seeding the engine
    */
-  Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl{_s, _t} {}
-
-  /**
-   * @brief Seed (and thus re-initialize) the underlying RNG engine
-   * @param _s 64b seed used to initialize the RNG
-   * @note If you need non-reproducibility, pass a seed that's, for example, a
-   *       function of timestamp. Another example is to use the c++11's
-   *       `std::random_device` for setting seed.
-   */
-  void seed(uint64_t _s) { detail::RngImpl::seed(_s); }
+  Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {}
 
   /**
    * @brief Generates the 'a' and 'b' parameters for a modulo affine
@@ -109,13 +112,14 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    * @{
    */
-  template <typename Type, typename LenType = int>
-  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void uniform(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream)
   {
     detail::RngImpl::uniform(ptr, len, start, end, stream);
   }
-  template <typename IntType, typename LenType = int>
-  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
+
+  template <typename OutType, typename LenType = int>
+  void uniformInt(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream)
   {
     detail::RngImpl::uniformInt(ptr, len, start, end, stream);
   }
@@ -132,11 +136,12 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    * @{
    */
-  template <typename Type, typename LenType = int>
-  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void normal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream)
   {
     detail::RngImpl::normal(ptr, len, mu, sigma, stream);
   }
+
   template <typename IntType, typename LenType = int>
   void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
   {
@@ -158,22 +163,22 @@ class Rng : public detail::RngImpl {
    * @param ptr the output table (dim = n_rows x n_cols)
    * @param n_rows number of rows in the table
    * @param n_cols number of columns in the table
-   * @param mu mean vector (dim = n_cols x 1).
+   * @param mu_vec mean vector (dim = n_cols x 1).
    * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass
    * a nullptr to use the same scalar 'sigma' across all components
    * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void normalTable(Type* ptr,
+  template <typename OutType, typename LenType = int>
+  void normalTable(OutType* ptr,
                    LenType n_rows,
                    LenType n_cols,
-                   const Type* mu,
-                   const Type* sigma_vec,
-                   Type sigma,
+                   const OutType* mu_vec,
+                   const OutType* sigma_vec,
+                   OutType sigma,
                    cudaStream_t stream)
   {
-    detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, stream);
+    detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu_vec, sigma_vec, sigma, stream);
   }
 
   /**
@@ -185,8 +190,8 @@ class Rng : public detail::RngImpl {
    * @param val value to be filled
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void fill(OutType* ptr, LenType len, OutType val, cudaStream_t stream)
   {
     detail::RngImpl::fill(ptr, len, val, stream);
   }
@@ -219,14 +224,14 @@ class Rng : public detail::RngImpl {
    * @param scale scaling factor
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void scaled_bernoulli(OutType* ptr, LenType len, OutType prob, OutType scale, cudaStream_t stream)
   {
     detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream);
   }
 
   /**
-   * @brief Generate gumbel distributed random numbers
+   * @brief Generate Gumbel distributed random numbers
    * @tparam Type data type of output random number
    * @tparam LenType data type used to represent length of the arrays
    * @param ptr output array
@@ -236,8 +241,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    * @note https://en.wikipedia.org/wiki/Gumbel_distribution
    */
-  template <typename Type, typename LenType = int>
-  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void gumbel(OutType* ptr, LenType len, OutType mu, OutType beta, cudaStream_t stream)
   {
     detail::RngImpl::gumbel(ptr, len, mu, beta, stream);
   }
@@ -252,8 +257,8 @@ class Rng : public detail::RngImpl {
    * @param sigma std-dev of the distribution
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void lognormal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream)
   {
     detail::RngImpl::lognormal(ptr, len, mu, sigma, stream);
   }
@@ -268,8 +273,8 @@ class Rng : public detail::RngImpl {
    * @param scale scale value
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void logistic(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream)
   {
     detail::RngImpl::logistic(ptr, len, mu, scale, stream);
   }
@@ -283,8 +288,8 @@ class Rng : public detail::RngImpl {
    * @param lambda the lambda
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void exponential(OutType* ptr, LenType len, OutType lambda, cudaStream_t stream)
   {
     detail::RngImpl::exponential(ptr, len, lambda, stream);
   }
@@ -298,8 +303,8 @@ class Rng : public detail::RngImpl {
    * @param sigma the sigma
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void rayleigh(OutType* ptr, LenType len, OutType sigma, cudaStream_t stream)
   {
     detail::RngImpl::rayleigh(ptr, len, sigma, stream);
   }
@@ -314,12 +319,17 @@ class Rng : public detail::RngImpl {
    * @param scale the scale
    * @param stream stream where to launch the kernel
    */
-  template <typename Type, typename LenType = int>
-  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  template <typename OutType, typename LenType = int>
+  void laplace(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream)
   {
     detail::RngImpl::laplace(ptr, len, mu, scale, stream);
   }
 
+  void advance(uint64_t max_streams, uint64_t max_calls_per_subsequence)
+  {
+    detail::RngImpl::advance(max_streams, max_calls_per_subsequence);
+  }
+
   /**
    * @brief Sample the input array without replacement, optionally based on the
    * input weight vector for each element in the array
@@ -359,33 +369,6 @@ class Rng : public detail::RngImpl {
     detail::RngImpl::sampleWithoutReplacement(
       handle, out, outIdx, in, wts, sampledLen, len, stream);
   }
-
-  /**
-   * @brief Core method to generate a pdf based on the cdf that is defined in
-   *        the input device lambda
-   *
-   * @tparam OutType  output type
-   * @tparam MathType type on which arithmetic is done
-   * @tparam LenTyp   index type
-   * @tparam Lambda   device lambda (or operator)
-   *
-   * @param[out] ptr    output buffer [on device] [len = len]
-   * @param[in]  len    number of elements to be generated
-   * @param[in]  randOp the device lambda or operator
-   * @param[in]  stream cuda stream
-   * @{
-   */
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
-    detail::RngImpl::custom_distribution(ptr, len, randOp, stream);
-  }
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
-    detail::RngImpl::custom_distribution2(ptr, len, randOp, stream);
-  }
-  /** @} */
 };
 
 };  // end namespace random
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 7aab2c18c0..e35835a445 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -21,7 +21,6 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/eig.hpp>
-#include <raft/random/rng.hpp>
 
 namespace raft {
 namespace linalg {
@@ -30,10 +29,8 @@ template <typename T>
 struct EigSelInputs {
   T tolerance;
   int len;
-  int n_row;
-  int n_col;
-  unsigned long long int seed;
   int n;
+  int n_eigen_vals;
 };
 
 template <typename T>
@@ -49,10 +46,10 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
     : params(::testing::TestWithParam<EigSelInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       cov_matrix(params.len, stream),
-      eig_vectors(12, stream),
-      eig_vectors_ref(12, stream),
-      eig_vals(params.n_col, stream),
-      eig_vals_ref(params.n_col, stream)
+      eig_vectors(params.n_eigen_vals * params.n, stream),
+      eig_vectors_ref(params.n_eigen_vals * params.n, stream),
+      eig_vals(params.n, stream),
+      eig_vals_ref(params.n, stream)
   {
   }
 
@@ -61,6 +58,7 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   {
     int len = params.len;
 
+    ///@todo: Generate a random symmetric matrix
     T cov_matrix_h[] = {
       1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
@@ -78,16 +76,17 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
                              0.5123,
                              0.5123,
                              0.4874};
-    T eig_vals_ref_h[]    = {0.1024, 0.3096, 3.5266, 3.5266};
+    T eig_vals_ref_h[]    = {0.1024, 0.3096, 3.5266, 0.0};
 
-    raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream);
-    raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream);
+    raft::update_device(
+      eig_vectors_ref.data(), eig_vectors_ref_h, params.n_eigen_vals * params.n, stream);
+    raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n, stream);
 
     raft::linalg::eigSelDC(handle,
                            cov_matrix.data(),
-                           params.n_row,
-                           params.n_col,
-                           3,
+                           params.n,
+                           params.n,
+                           params.n_eigen_vals,
                            eig_vectors.data(),
                            eig_vals.data(),
                            EigVecMemUsage::OVERWRITE_INPUT,
@@ -107,16 +106,16 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   rmm::device_uvector<T> eig_vals_ref;
 };
 
-const std::vector<EigSelInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 3}};
 
-const std::vector<EigSelInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 3}};
 
 typedef EigSelTest<float> EigSelTestValF;
 TEST_P(EigSelTestValF, Result)
 {
   ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
                                 eig_vals.data(),
-                                params.n_col,
+                                params.n_eigen_vals,
                                 raft::CompareApproxAbs<float>(params.tolerance),
                                 stream));
 }
@@ -126,7 +125,7 @@ TEST_P(EigSelTestValD, Result)
 {
   ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
                                 eig_vals.data(),
-                                params.n_col,
+                                params.n_eigen_vals,
                                 raft::CompareApproxAbs<double>(params.tolerance),
                                 stream));
 }
@@ -136,7 +135,7 @@ TEST_P(EigSelTestVecF, Result)
 {
   ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
                                 eig_vectors.data(),
-                                12,
+                                params.n_eigen_vals * params.n,
                                 raft::CompareApproxAbs<float>(params.tolerance),
                                 stream));
 }
@@ -146,7 +145,7 @@ TEST_P(EigSelTestVecD, Result)
 {
   ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
                                 eig_vectors.data(),
-                                12,
+                                params.n_eigen_vals * params.n,
                                 raft::CompareApproxAbs<double>(params.tolerance),
                                 stream));
 }
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index 08e522d369..c63763d5a4 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
 namespace raft {
 namespace random {
 
-using namespace raft::random::detail;
+using namespace raft::random;
 
 enum RandomType {
   RNG_Normal,
@@ -99,7 +99,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
   {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
-    num_sigma = 10;
+    num_sigma = 4;
     Rng r(params.seed, params.gtype);
     switch (params.type) {
       case RNG_Normal: r.normal(data.data(), params.len, params.start, params.end, stream); break;
@@ -211,7 +211,7 @@ const std::vector<RngInputs<float>> inputsf = {
   {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenPhilox, 1234ULL},
   {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
   {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenPhilox, 1234ULL},
-  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 67632ULL},
   {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenPhilox, 1234ULL},
   {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
   {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenPhilox, 1234ULL},
@@ -220,39 +220,22 @@ const std::vector<RngInputs<float>> inputsf = {
   {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
   {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenPhilox, 1234ULL},
 
-  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
-  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenTaps, 1234ULL},
-  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
-  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenTaps, 1234ULL},
-  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
-  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenTaps, 1234ULL},
-  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
-  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenTaps, 1234ULL},
-  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
-  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenTaps, 1234ULL},
-  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
-  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenTaps, 1234ULL},
-  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
-  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenTaps, 1234ULL},
-  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
-  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenTaps, 1234ULL},
-
-  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
-  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenKiss99, 1234ULL},
-  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
-  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenKiss99, 1234ULL},
-  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL},
-  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenKiss99, 1234ULL},
-  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL},
-  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenKiss99, 1234ULL},
-  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL},
-  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenKiss99, 1234ULL},
-  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL},
-  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenKiss99, 1234ULL},
-  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL},
-  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}};
+  {0.0055, 32 * 1024, 1.f, 1.f, RNG_Normal, GenPC, 1234ULL},
+  {0.011, 8 * 1024, 1.f, 1.f, RNG_Normal, GenPC, 1234ULL},
+  {0.05, 32 * 1024, 1.f, 1.f, RNG_LogNormal, GenPC, 1234ULL},
+  {0.1, 8 * 1024, 1.f, 1.f, RNG_LogNormal, GenPC, 1234ULL},
+  {0.003, 32 * 1024, -1.f, 1.f, RNG_Uniform, GenPC, 1234ULL},
+  {0.005, 8 * 1024, -1.f, 1.f, RNG_Uniform, GenPC, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Gumbel, GenPC, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Gumbel, GenPC, 1234ULL},
+  {0.005, 32 * 1024, 1.f, 1.f, RNG_Logistic, GenPC, 1234ULL},
+  {0.01, 8 * 1024, 1.f, 1.f, RNG_Logistic, GenPC, 1234ULL},
+  {0.008, 32 * 1024, 1.f, 1.f, RNG_Exp, GenPC, 1234ULL},
+  {0.015, 8 * 1024, 1.f, 1.f, RNG_Exp, GenPC, 1234ULL},
+  {0.0125, 32 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPC, 1234ULL},
+  {0.025, 8 * 1024, 1.f, 1.f, RNG_Rayleigh, GenPC, 1234ULL},
+  {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenPC, 1234ULL},
+  {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenPC, 1234ULL}};
 
 TEST_P(RngTestF, Result)
 {
@@ -273,7 +256,7 @@ const std::vector<RngInputs<double>> inputsd = {
   {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenPhilox, 1234ULL},
   {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL},
   {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenPhilox, 1234ULL},
-  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 67632ULL},
   {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenPhilox, 1234ULL},
   {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL},
   {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenPhilox, 1234ULL},
@@ -282,39 +265,23 @@ const std::vector<RngInputs<double>> inputsd = {
   {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL},
   {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenPhilox, 1234ULL},
 
-  {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL},
-  {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenTaps, 1234ULL},
-  {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL},
-  {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenTaps, 1234ULL},
-  {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL},
-  {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenTaps, 1234ULL},
-  {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL},
-  {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenTaps, 1234ULL},
-  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL},
-  {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenTaps, 1234ULL},
-  {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL},
-  {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenTaps, 1234ULL},
-  {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL},
-  {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenTaps, 1234ULL},
-  {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL},
-  {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenTaps, 1234ULL},
-
-  {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL},
-  {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenKiss99, 1234ULL},
-  {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL},
-  {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenKiss99, 1234ULL},
-  {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
-  {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenKiss99, 1234ULL},
-  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
-  {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenKiss99, 1234ULL},
-  {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
-  {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenKiss99, 1234ULL},
-  {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
-  {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
-  {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
-  {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
+  {0.0055, 32 * 1024, 1.0, 1.0, RNG_Normal, GenPC, 1234ULL},
+  {0.011, 8 * 1024, 1.0, 1.0, RNG_Normal, GenPC, 1234ULL},
+  {0.05, 32 * 1024, 1.0, 1.0, RNG_LogNormal, GenPC, 1234ULL},
+  {0.1, 8 * 1024, 1.0, 1.0, RNG_LogNormal, GenPC, 1234ULL},
+  {0.003, 32 * 1024, -1.0, 1.0, RNG_Uniform, GenPC, 1234ULL},
+  {0.005, 8 * 1024, -1.0, 1.0, RNG_Uniform, GenPC, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Gumbel, GenPC, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Gumbel, GenPC, 1234ULL},
+  {0.005, 32 * 1024, 1.0, 1.0, RNG_Logistic, GenPC, 1234ULL},
+  {0.01, 8 * 1024, 1.0, 1.0, RNG_Logistic, GenPC, 1234ULL},
+  {0.008, 32 * 1024, 1.0, 1.0, RNG_Exp, GenPC, 1234ULL},
+  {0.015, 8 * 1024, 1.0, 1.0, RNG_Exp, GenPC, 1234ULL},
+  {0.0125, 32 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPC, 1234ULL},
+  {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenPC, 1234ULL},
+  {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenPC, 1234ULL},
+  {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenPC, 1234ULL}};
+
 TEST_P(RngTestD, Result)
 {
   double meanvar[2];
@@ -381,7 +348,7 @@ TEST(Rng, MeanError)
   rmm::device_uvector<float> mean_result(num_experiments, stream);
   rmm::device_uvector<float> std_result(num_experiments, stream);
 
-  for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) {
+  for (auto rtype : {GenPhilox, GenPC}) {
     Rng r(seed, rtype);
     r.normal(data.data(), len, 3.3f, 0.23f, stream);
     // r.uniform(data, len, -1.0, 2.0);
@@ -560,10 +527,9 @@ typedef RngNormalTableTest<float> RngNormalTableTestF;
 const std::vector<RngNormalTableInputs<float>> inputsf_t = {
   {0.0055, 32, 1024, 1.f, 1.f, GenPhilox, 1234ULL},
   {0.011, 8, 1024, 1.f, 1.f, GenPhilox, 1234ULL},
-  {0.0055, 32, 1024, 1.f, 1.f, GenTaps, 1234ULL},
-  {0.011, 8, 1024, 1.f, 1.f, GenTaps, 1234ULL},
-  {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL},
-  {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}};
+
+  {0.0055, 32, 1024, 1.f, 1.f, GenPC, 1234ULL},
+  {0.011, 8, 1024, 1.f, 1.f, GenPC, 1234ULL}};
 
 TEST_P(RngNormalTableTestF, Result)
 {
@@ -578,10 +544,9 @@ typedef RngNormalTableTest<double> RngNormalTableTestD;
 const std::vector<RngNormalTableInputs<double>> inputsd_t = {
   {0.0055, 32, 1024, 1.0, 1.0, GenPhilox, 1234ULL},
   {0.011, 8, 1024, 1.0, 1.0, GenPhilox, 1234ULL},
-  {0.0055, 32, 1024, 1.0, 1.0, GenTaps, 1234ULL},
-  {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL},
-  {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL},
-  {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}};
+
+  {0.0055, 32, 1024, 1.0, 1.0, GenPC, 1234ULL},
+  {0.011, 8, 1024, 1.0, 1.0, GenPC, 1234ULL}};
 TEST_P(RngNormalTableTestD, Result)
 {
   double meanvar[2];
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index 02c8dc9f39..2715181db1 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
 namespace raft {
 namespace random {
 
-using namespace raft::random::detail;
+using namespace raft::random;
 
 enum RandomType { RNG_Uniform };
 
@@ -125,10 +125,9 @@ typedef RngTest<uint32_t> RngTestU32;
 const std::vector<RngInputs<uint32_t>> inputs_u32 = {
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
+
+  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL},
+  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL}};
 TEST_P(RngTestU32, Result)
 {
   float meanvar[2];
@@ -142,10 +141,9 @@ typedef RngTest<uint64_t> RngTestU64;
 const std::vector<RngInputs<uint64_t>> inputs_u64 = {
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
+
+  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL},
+  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL}};
 TEST_P(RngTestU64, Result)
 {
   float meanvar[2];
@@ -159,10 +157,9 @@ typedef RngTest<int32_t> RngTestS32;
 const std::vector<RngInputs<int32_t>> inputs_s32 = {
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
+
+  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL},
+  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL}};
 TEST_P(RngTestS32, Result)
 {
   float meanvar[2];
@@ -176,10 +173,9 @@ typedef RngTest<int64_t> RngTestS64;
 const std::vector<RngInputs<int64_t>> inputs_s64 = {
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPhilox, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
-  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
-  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
+
+  {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL},
+  {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenPC, 1234ULL}};
 TEST_P(RngTestS64, Result)
 {
   float meanvar[2];
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index a8bba340fa..e469c366c3 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 namespace raft {
 namespace random {
 
-using namespace raft::random::detail;
+using namespace raft::random;
 
 // Terminology:
 // SWoR - Sample Without Replacement
@@ -91,67 +91,45 @@ class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
 };
 
 typedef SWoRTest<float> SWoRTestF;
-const std::vector<SWoRInputs<float>> inputsf = {
-  {1024, 512, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 1024, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 1024 - 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 512, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 1024, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 512, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 1024, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.f, GenPhilox, 1234ULL},
-  {1024, 512, 10, 100000.f, GenPhilox, 1234ULL},
-
-  {1024, 512, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 1024, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 512 + 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 1024 - 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 512 + 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 1024 - 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 512, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 1024, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 512, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 1024, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.f, GenTaps, 1234ULL},
-  {1024, 512, 10, 100000.f, GenTaps, 1234ULL},
-
-  {1024, 512, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 1024, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 512 + 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 1024 - 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 512 + 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 512, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 1024, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 512, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 1024, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.f, GenKiss99, 1234ULL},
-  {1024, 512, 10, 100000.f, GenKiss99, 1234ULL},
-};
+const std::vector<SWoRInputs<float>> inputsf = {{1024, 512, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 1024, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 1024 - 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 512, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 1024, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 1, 1024 - 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 512, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 1024, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 512 + 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 1024 + 1, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 512 + 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024 + 2, 1024 + 2, -1, 0.f, GenPhilox, 1234ULL},
+                                                {1024, 512, 10, 100000.f, GenPhilox, 1234ULL},
+
+                                                {1024, 512, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 1024, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 512 + 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 1024 - 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 512 + 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 1024 - 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 512, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 1024, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 512 + 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 1024 + 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 512 + 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 1, 1024 - 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 512, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 1024, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 512 + 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 1024 + 1, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 512 + 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024 + 2, 1024 + 2, -1, 0.f, GenPC, 1234ULL},
+                                                {1024, 512, 10, 100000.f, GenPC, 1234ULL}};
 
 TEST_P(SWoRTestF, Result)
 {
@@ -173,67 +151,45 @@ TEST_P(SWoRTestF, Result)
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf));
 
 typedef SWoRTest<double> SWoRTestD;
-const std::vector<SWoRInputs<double>> inputsd = {
-  {1024, 512, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 1024, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 1024 - 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 512, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 1024, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 512, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 1024, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.0, GenPhilox, 1234ULL},
-  {1024, 512, 10, 100000.0, GenPhilox, 1234ULL},
-
-  {1024, 512, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 1024, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 512 + 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 1024 - 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 512 + 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 1024 - 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 512, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 1024, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 512, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 1024, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.0, GenTaps, 1234ULL},
-  {1024, 512, 10, 100000.0, GenTaps, 1234ULL},
-
-  {1024, 512, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 1024, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 512 + 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 1024 - 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 512 + 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 512, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 1024, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 512 + 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 512 + 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 1, 1024 - 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 512, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 1024, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 512 + 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 1024 + 1, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 512 + 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024 + 2, 1024 + 2, -1, 0.0, GenKiss99, 1234ULL},
-  {1024, 512, 10, 100000.0, GenKiss99, 1234ULL},
-};
+const std::vector<SWoRInputs<double>> inputsd = {{1024, 512, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 1024, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 1024 - 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 512, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 1024, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 1, 1024 - 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 512, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 1024, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 512 + 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 1024 + 1, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 512 + 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024 + 2, 1024 + 2, -1, 0.0, GenPhilox, 1234ULL},
+                                                 {1024, 512, 10, 100000.0, GenPhilox, 1234ULL},
+
+                                                 {1024, 512, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 1024, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 512 + 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 1024 - 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 512 + 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 1024 - 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 512, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 1024, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 512 + 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 1024 + 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 512 + 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 1, 1024 - 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 512, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 1024, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 512 + 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 1024 + 1, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 512 + 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024 + 2, 1024 + 2, -1, 0.0, GenPC, 1234ULL},
+                                                 {1024, 512, 10, 100000.0, GenPC, 1234ULL}};
 
 TEST_P(SWoRTestD, Result)
 {
diff --git a/thirdparty/README.md b/thirdparty/README.md
new file mode 100644
index 0000000000..d9f05379dc
--- /dev/null
+++ b/thirdparty/README.md
@@ -0,0 +1,4 @@
+# PCG
+
+Link to the repository https://github.com/imneme/pcg-c-basic
+Commit ID for last borrowed code bc39cd76ac3d541e618606bcc6e1e5ba5e5e6aa3
diff --git a/thirdparty/pcg/LICENSE.txt b/thirdparty/pcg/LICENSE.txt
new file mode 100644
index 0000000000..8dada3edaf
--- /dev/null
+++ b/thirdparty/pcg/LICENSE.txt
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/thirdparty/pcg/pcg_basic.c b/thirdparty/pcg/pcg_basic.c
new file mode 100644
index 0000000000..8c2fd0d22b
--- /dev/null
+++ b/thirdparty/pcg/pcg_basic.c
@@ -0,0 +1,116 @@
+/*
+ * PCG Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *       http://www.pcg-random.org
+ */
+
+/*
+ * This code is derived from the full C implementation, which is in turn
+ * derived from the canonical C++ PCG implementation. The C++ version
+ * has many additional features and is preferable if you can use C++ in
+ * your project.
+ */
+
+#include "pcg_basic.h"
+
+// state for global RNGs
+
+static pcg32_random_t pcg32_global = PCG32_INITIALIZER;
+
+// pcg32_srandom(initstate, initseq)
+// pcg32_srandom_r(rng, initstate, initseq):
+//     Seed the rng.  Specified in two parts, state initializer and a
+//     sequence selection constant (a.k.a. stream id)
+
+void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, uint64_t initseq)
+{
+    rng->state = 0U;
+    rng->inc = (initseq << 1u) | 1u;
+    pcg32_random_r(rng);
+    rng->state += initstate;
+    pcg32_random_r(rng);
+}
+
+void pcg32_srandom(uint64_t seed, uint64_t seq)
+{
+    pcg32_srandom_r(&pcg32_global, seed, seq);
+}
+
+// pcg32_random()
+// pcg32_random_r(rng)
+//     Generate a uniformly distributed 32-bit random number
+
+uint32_t pcg32_random_r(pcg32_random_t* rng)
+{
+    uint64_t oldstate = rng->state;
+    rng->state = oldstate * 6364136223846793005ULL + rng->inc;
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot = oldstate >> 59u;
+    return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+}
+
+uint32_t pcg32_random()
+{
+    return pcg32_random_r(&pcg32_global);
+}
+
+
+// pcg32_boundedrand(bound):
+// pcg32_boundedrand_r(rng, bound):
+//     Generate a uniformly distributed number, r, where 0 <= r < bound
+
+uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound)
+{
+    // To avoid bias, we need to make the range of the RNG a multiple of
+    // bound, which we do by dropping output less than a threshold.
+    // A naive scheme to calculate the threshold would be to do
+    //
+    //     uint32_t threshold = 0x100000000ull % bound;
+    //
+    // but 64-bit div/mod is slower than 32-bit div/mod (especially on
+    // 32-bit platforms).  In essence, we do
+    //
+    //     uint32_t threshold = (0x100000000ull-bound) % bound;
+    //
+    // because this version will calculate the same modulus, but the LHS
+    // value is less than 2^32.
+
+    uint32_t threshold = -bound % bound;
+
+    // Uniformity guarantees that this loop will terminate.  In practice, it
+    // should usually terminate quickly; on average (assuming all bounds are
+    // equally likely), 82.25% of the time, we can expect it to require just
+    // one iteration.  In the worst case, someone passes a bound of 2^31 + 1
+    // (i.e., 2147483649), which invalidates almost 50% of the range.  In 
+    // practice, bounds are typically small and only a tiny amount of the range
+    // is eliminated.
+    for (;;) {
+        uint32_t r = pcg32_random_r(rng);
+        if (r >= threshold)
+            return r % bound;
+    }
+}
+
+
+uint32_t pcg32_boundedrand(uint32_t bound)
+{
+    return pcg32_boundedrand_r(&pcg32_global, bound);
+}
+