diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh
index 7f994fb07f..d6999d9236 100644
--- a/cpp/include/raft/random/detail/rng_device.cuh
+++ b/cpp/include/raft/random/detail/rng_device.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
+#include <raft/util/integer_utils.hpp>
 
 #include <curand_kernel.h>
 
@@ -139,9 +140,9 @@ struct SamplingParams {
 };
 
 template <typename Type>
-DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
+HDI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
 {
-  constexpr Type twoPi  = Type(2.0) * Type(3.141592654);
+  constexpr Type twoPi  = Type(2.0) * Type(3.141592653589793);
   constexpr Type minus2 = -Type(2.0);
   Type R                = raft::sqrt(minus2 * raft::log(val1));
   Type theta            = twoPi * val2;
@@ -152,27 +153,27 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type
 }
 
 template <typename Type>
-DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
+HDI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
 {
   box_muller_transform<Type>(val1, val2, sigma1, mu1, sigma1, mu1);
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    InvariantDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     InvariantDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   *val = params.const_val;
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    UniformDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     UniformDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res;
   gen.next(res);
@@ -180,11 +181,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    UniformIntDistParams<OutType, uint32_t> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     UniformIntDistParams<OutType, uint32_t> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   uint32_t x = 0;
   uint32_t s = params.diff;
@@ -203,32 +204,31 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    UniformIntDistParams<OutType, uint64_t> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     UniformIntDistParams<OutType, uint64_t> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
+  using raft::wmul_64bit;
   uint64_t x = 0;
   gen.next(x);
   uint64_t s = params.diff;
   uint64_t m_lo, m_hi;
   // m = x * s;
-  asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s));
-  asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s));
+  wmul_64bit(m_hi, m_lo, x, s);
   if (m_lo < s) {
     uint64_t t = (-s) % s;  // (2^64 - s) mod s
     while (m_lo < t) {
       gen.next(x);
-      asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s));
-      asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s));
+      wmul_64bit(m_hi, m_lo, x, s);
     }
   }
   *val = OutType(m_hi) + params.start;
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(
+HDI void custom_next(
   GenType& gen, OutType* val, NormalDistParams<OutType> params, LenType idx = 0, LenType stride = 0)
 {
   OutType res1, res2;
@@ -245,21 +245,18 @@ DI void custom_next(
 }
 
 template <typename GenType, typename IntType, typename LenType>
-DI void custom_next(GenType& gen,
-                    IntType* val,
-                    NormalIntDistParams<IntType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     IntType* val,
+                     NormalIntDistParams<IntType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
-  IntType res1_int, res2_int;
-
+  double res1, res2;
   do {
-    gen.next(res1_int);
-  } while (res1_int == 0);
+    gen.next(res1);
+  } while (res1 == double(0.0));
 
-  gen.next(res2_int);
-  double res1  = static_cast<double>(res1_int);
-  double res2  = static_cast<double>(res2_int);
+  gen.next(res2);
   double mu    = static_cast<double>(params.mu);
   double sigma = static_cast<double>(params.sigma);
   box_muller_transform<double>(res1, res2, sigma, mu);
@@ -268,11 +265,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    NormalTableDistParams<OutType, LenType> params,
-                    LenType idx,
-                    LenType stride)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     NormalTableDistParams<OutType, LenType> params,
+                     LenType idx,
+                     LenType stride)
 {
   OutType res1, res2;
 
@@ -293,7 +290,7 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename Type, typename LenType>
-DI void custom_next(
+HDI void custom_next(
   GenType& gen, OutType* val, BernoulliDistParams<Type> params, LenType idx = 0, LenType stride = 0)
 {
   Type res = 0;
@@ -302,11 +299,11 @@ DI void custom_next(
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    ScaledBernoulliDistParams<OutType> params,
-                    LenType idx,
-                    LenType stride)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     ScaledBernoulliDistParams<OutType> params,
+                     LenType idx,
+                     LenType stride)
 {
   OutType res = 0;
   gen.next(res);
@@ -314,7 +311,7 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(
+HDI void custom_next(
   GenType& gen, OutType* val, GumbelDistParams<OutType> params, LenType idx = 0, LenType stride = 0)
 {
   OutType res = 0;
@@ -327,11 +324,11 @@ DI void custom_next(
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    LogNormalDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     LogNormalDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res1 = 0, res2 = 0;
   do {
@@ -345,11 +342,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    LogisticDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     LogisticDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res;
 
@@ -362,11 +359,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    ExponentialDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     ExponentialDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res;
   gen.next(res);
@@ -375,11 +372,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    RayleighDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     RayleighDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res;
   gen.next(res);
@@ -390,11 +387,11 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(GenType& gen,
-                    OutType* val,
-                    LaplaceDistParams<OutType> params,
-                    LenType idx    = 0,
-                    LenType stride = 0)
+HDI void custom_next(GenType& gen,
+                     OutType* val,
+                     LaplaceDistParams<OutType> params,
+                     LenType idx    = 0,
+                     LenType stride = 0)
 {
   OutType res, out;
 
@@ -417,7 +414,7 @@ DI void custom_next(GenType& gen,
 }
 
 template <typename GenType, typename OutType, typename LenType>
-DI void custom_next(
+HDI void custom_next(
   GenType& gen, OutType* val, SamplingParams<OutType, LenType> params, LenType idx, LenType stride)
 {
   OutType res;
@@ -536,36 +533,33 @@ struct PCGenerator {
   static constexpr auto GEN_TYPE = GeneratorType::GenPC;
 
   /**
-   * @brief ctor. Initializes the state for RNG. This code is derived from PCG basic code
-   * @param seed the seed (can be same across all threads). Same as PCG's initstate
-   * @param subsequence is same as PCG's initseq
-   * @param offset unused
+   * @brief ctor. Initializes the PCG
+   * @param rng_state is the generator state used for initializing the generator
+   * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences
+   * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a
+   * unique set of random numbers. This can be achieved by initializing the generator with same
+   * rng_state for all the threads and distinct values for subsequence.
    */
-  DI PCGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  HDI PCGenerator(const DeviceState<PCGenerator>& rng_state, const uint64_t subsequence)
   {
-    pcg_state = uint64_t(0);
-    inc       = (subsequence << 1u) | 1u;
-    uint32_t discard;
-    next(discard);
-    pcg_state += seed;
-    next(discard);
-    skipahead(offset);
+    _init_pcg(rng_state.seed, rng_state.base_subsequence + subsequence, subsequence);
   }
 
-  DI PCGenerator(const DeviceState<PCGenerator>& rng_state, const uint64_t subsequence)
+  /**
+   * @brief ctor. This is lower level constructor for PCG
+   * This code is derived from PCG basic code
+   * @param seed A 64-bit seed for the generator
+   * @param subsequence The id of subsequence that should be generated [0, 2^64-1]
+   * @param offset Initial `offset` number of items are skipped from the subsequence
+   */
+  HDI PCGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
   {
-    pcg_state = uint64_t(0);
-    inc       = ((rng_state.base_subsequence + subsequence) << 1u) | 1u;
-    uint32_t discard;
-    next(discard);
-    pcg_state += rng_state.seed;
-    next(discard);
-    skipahead(subsequence);
+    _init_pcg(seed, subsequence, offset);
   }
 
   // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
   // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
-  DI void skipahead(uint64_t offset)
+  HDI void skipahead(uint64_t offset)
   {
     uint64_t G = 1;
     uint64_t h = 6364136223846793005ULL;
@@ -588,7 +582,7 @@ struct PCGenerator {
    * @brief This code is derived from PCG basic code
    * @{
    */
-  DI uint32_t next_u32()
+  HDI uint32_t next_u32()
   {
     uint32_t ret;
     uint64_t oldstate   = pcg_state;
@@ -598,7 +592,7 @@ struct PCGenerator {
     ret                 = (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
     return ret;
   }
-  DI uint64_t next_u64()
+  HDI uint64_t next_u64()
   {
     uint64_t ret;
     uint32_t a, b;
@@ -608,7 +602,7 @@ struct PCGenerator {
     return ret;
   }
 
-  DI int32_t next_i32()
+  HDI int32_t next_i32()
   {
     int32_t ret;
     uint32_t val;
@@ -617,7 +611,7 @@ struct PCGenerator {
     return ret;
   }
 
-  DI int64_t next_i64()
+  HDI int64_t next_i64()
   {
     int64_t ret;
     uint64_t val;
@@ -626,7 +620,7 @@ struct PCGenerator {
     return ret;
   }
 
-  DI float next_float()
+  HDI float next_float()
   {
     float ret;
     uint32_t val = next_u32() >> 8;
@@ -634,7 +628,7 @@ struct PCGenerator {
     return ret;
   }
 
-  DI double next_double()
+  HDI double next_double()
   {
     double ret;
     uint64_t val = next_u64() >> 11;
@@ -642,17 +636,27 @@ struct PCGenerator {
     return ret;
   }
 
-  DI void next(uint32_t& ret) { ret = next_u32(); }
-  DI void next(uint64_t& ret) { ret = next_u64(); }
-  DI void next(int32_t& ret) { ret = next_i32(); }
-  DI void next(int64_t& ret) { ret = next_i64(); }
+  HDI void next(uint32_t& ret) { ret = next_u32(); }
+  HDI void next(uint64_t& ret) { ret = next_u64(); }
+  HDI void next(int32_t& ret) { ret = next_i32(); }
+  HDI void next(int64_t& ret) { ret = next_i64(); }
 
-  DI void next(float& ret) { ret = next_float(); }
-  DI void next(double& ret) { ret = next_double(); }
+  HDI void next(float& ret) { ret = next_float(); }
+  HDI void next(double& ret) { ret = next_double(); }
 
   /** @} */
 
  private:
+  HDI void _init_pcg(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
+    pcg_state = uint64_t(0);
+    inc       = (subsequence << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    pcg_state += seed;
+    next(discard);
+    skipahead(offset);
+  }
   uint64_t pcg_state;
   uint64_t inc;
 };
diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp
index 6faab5381c..c14d4327c3 100644
--- a/cpp/include/raft/util/integer_utils.hpp
+++ b/cpp/include/raft/util/integer_utils.hpp
@@ -24,6 +24,7 @@
  */
 
 #include <limits>
+#include <raft/core/detail/macros.hpp>
 #include <stdexcept>
 #include <type_traits>
 
@@ -199,4 +200,38 @@ struct is_narrowing<From, To, std::void_t<decltype(To{std::declval<From>()})>> :
 template <typename From, typename To>
 inline constexpr bool is_narrowing_v = is_narrowing<From, To>::value;  // NOLINT
 
+/** Wide multiplication of two unsigned 64-bit integers */
+_RAFT_HOST_DEVICE inline void wmul_64bit(uint64_t& res_hi, uint64_t& res_lo, uint64_t a, uint64_t b)
+{
+#ifdef __CUDA_ARCH__
+  asm("mul.hi.u64 %0, %1, %2;" : "=l"(res_hi) : "l"(a), "l"(b));
+  asm("mul.lo.u64 %0, %1, %2;" : "=l"(res_lo) : "l"(a), "l"(b));
+#else
+  uint32_t a_hi, a_lo, b_hi, b_lo;
+
+  a_hi = uint32_t(a >> 32);
+  a_lo = uint32_t(a & uint64_t(0x00000000FFFFFFFF));
+  b_hi = uint32_t(b >> 32);
+  b_lo = uint32_t(b & uint64_t(0x00000000FFFFFFFF));
+
+  uint64_t t0 = uint64_t(a_lo) * uint64_t(b_lo);
+  uint64_t t1 = uint64_t(a_hi) * uint64_t(b_lo);
+  uint64_t t2 = uint64_t(a_lo) * uint64_t(b_hi);
+  uint64_t t3 = uint64_t(a_hi) * uint64_t(b_hi);
+
+  uint64_t carry = 0, trial = 0;
+
+  res_lo = t0;
+  trial  = res_lo + (t1 << 32);
+  if (trial < res_lo) carry++;
+  res_lo = trial;
+  trial  = res_lo + (t2 << 32);
+  if (trial < res_lo) carry++;
+  res_lo = trial;
+
+  // No need to worry about carry in this addition
+  res_hi = (t1 >> 32) + (t2 >> 32) + t3 + carry;
+#endif
+}
+
 }  // namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index deefd9644a..db4c59c807 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -269,6 +269,7 @@ if(BUILD_TESTS)
     test/random/make_blobs.cu
     test/random/make_regression.cu
     test/random/multi_variable_gaussian.cu
+    test/random/rng_pcg_host_api.cu
     test/random/permute.cu
     test/random/rng.cu
     test/random/rng_discrete.cu
@@ -425,6 +426,7 @@ if(BUILD_TESTS)
     test/util/cudart_utils.cpp
     test/util/device_atomics.cu
     test/util/integer_utils.cpp
+    test/util/integer_utils.cu
     test/util/pow2_utils.cu
     test/util/reduction.cu
   )
diff --git a/cpp/test/random/rng_pcg_host_api.cu b/cpp/test/random/rng_pcg_host_api.cu
new file mode 100644
index 0000000000..a0263f0586
--- /dev/null
+++ b/cpp/test/random/rng_pcg_host_api.cu
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+namespace raft {
+namespace random {
+
+// CPT - Calls Per Thread, How many calls to custom_next is made by a single thread
+// IPC - Items Per Call, How many items are returned by a single call to custom_next (usually IPC =
+// 1 or 2)
+template <typename DType, typename ParamType, int CPT, int IPC>
+__host__ __device__ void single_thread_fill(DType* buffer,
+                                            DeviceState<PCGenerator> r,
+                                            ParamType params,
+                                            const size_t total_threads,
+                                            const size_t len,
+                                            const size_t tid)
+{
+  PCGenerator gen(r, tid);
+
+  for (size_t i = 0; i < CPT; i++) {
+    DType val[IPC];
+    size_t index = (tid * CPT * IPC) + i * IPC;
+    custom_next(gen, val, params, index, total_threads);
+    for (int j = 0; j < IPC; j++) {
+      if (index + j < len) { buffer[index + j] = val[j]; }
+    }
+  }
+}
+
+template <typename DType, typename ParamType, int CPT, int IPC>
+__global__ void pcg_device_kernel(DType* buffer,
+                                  DeviceState<PCGenerator> r,
+                                  ParamType params,
+                                  const size_t total_threads,
+                                  const size_t len)
+{
+  int tid = int(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  single_thread_fill<DType, ParamType, CPT, IPC>(buffer, r, params, total_threads, len, tid);
+}
+
+template <typename ParamType, typename DataType, int CPT, int IPC>
+class HostApiTest {
+ public:
+  HostApiTest() : stream(resource::get_cuda_stream(handle)), d_buffer(0, stream)
+  {
+    len = total_threads * CPT * IPC;
+    d_buffer.resize(len, stream);
+    h_buffer.resize(len);
+  }
+  void FillBuffers(uint64_t seed)
+  {
+    RngState r(seed, GenPC);
+    DeviceState<PCGenerator> d_state(r);
+
+    pcg_device_kernel<DataType, ParamType, CPT, IPC><<<n_blocks, n_threads, 0, stream>>>(
+      d_buffer.data(), d_state, dist_params, total_threads, len);
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+    for (size_t tid = 0; tid < len; tid++) {
+      single_thread_fill<DataType, ParamType, CPT, IPC>(
+        h_buffer.data(), d_state, dist_params, total_threads, len, tid);
+    }
+  }
+  void SetParams(ParamType _dist_params) { dist_params = _dist_params; }
+
+  void test()
+  {
+    ASSERT_TRUE(devArrMatchHost(
+      h_buffer.data(), d_buffer.data(), len, raft::CompareApprox<double>(1e-5), stream));
+  }
+  ParamType dist_params;
+  raft::resources handle;
+  cudaStream_t stream;
+
+  static const int n_blocks         = 128;
+  static const int n_threads        = 64;
+  static const size_t total_threads = size_t(n_blocks) * n_threads;
+
+  size_t len;
+  rmm::device_uvector<DataType> d_buffer;
+  std::vector<DataType> h_buffer;
+};
+
+// This Wrapper class is needed because gtest typed test allows single type per class
+template <typename T>
+class TestW : public testing::Test {
+ protected:
+  void SetUp() override
+  {
+    test_obj.SetParams(p);
+    test_obj.FillBuffers(seed);
+  }
+
+ public:
+  void TestFillBuffer() { test_obj.test(); }
+  T test_obj;
+  using ParamType = decltype(T::dist_params);
+  static ParamType p;
+  const uint64_t seed = 42;
+};
+
+TYPED_TEST_SUITE_P(TestW);
+
+TYPED_TEST_P(TestW, host_api_test) { this->TestFillBuffer(); }
+
+REGISTER_TYPED_TEST_SUITE_P(TestW, host_api_test);
+
+using InvariantT = HostApiTest<InvariantDistParams<int>, int, 16, 1>;
+template <>
+InvariantDistParams<int> TestW<InvariantT>::p = {.const_val = 123456};
+
+using UniformT = HostApiTest<UniformDistParams<double>, double, 16, 1>;
+template <>
+UniformDistParams<double> TestW<UniformT>::p = {.start = 0.0, .end = 1.0};
+
+using UniformInt32T = HostApiTest<UniformIntDistParams<uint32_t, uint32_t>, uint32_t, 16, 1>;
+template <>
+UniformIntDistParams<uint32_t, uint32_t> TestW<UniformInt32T>::p = {
+  .start = 0, .end = 100000, .diff = 100000};
+
+using UniformInt64T = HostApiTest<UniformIntDistParams<uint64_t, uint64_t>, uint64_t, 16, 1>;
+template <>
+UniformIntDistParams<uint64_t, uint64_t> TestW<UniformInt64T>::p = {
+  .start = 0, .end = 100000, .diff = 100000};
+
+using NormalT = HostApiTest<NormalDistParams<double>, double, 16, 2>;
+template <>
+NormalDistParams<double> TestW<NormalT>::p = {.mu = 0.5, .sigma = 0.5};
+
+using NormalIntT = HostApiTest<NormalIntDistParams<uint32_t>, uint32_t, 16, 2>;
+template <>
+NormalIntDistParams<uint32_t> TestW<NormalIntT>::p = {.mu = 10000000, .sigma = 10000};
+
+using BernoulliT = HostApiTest<BernoulliDistParams<double>, double, 16, 1>;
+template <>
+BernoulliDistParams<double> TestW<BernoulliT>::p = {.prob = 0.7};
+
+using ScaledBernoulliT = HostApiTest<ScaledBernoulliDistParams<double>, double, 16, 1>;
+template <>
+ScaledBernoulliDistParams<double> TestW<ScaledBernoulliT>::p = {.prob = 0.7, .scale = 0.5};
+
+using GumbelT = HostApiTest<GumbelDistParams<double>, double, 16, 1>;
+template <>
+GumbelDistParams<double> TestW<GumbelT>::p = {.mu = 0.7, .beta = 0.5};
+
+using LogNormalT = HostApiTest<LogNormalDistParams<double>, double, 16, 2>;
+template <>
+LogNormalDistParams<double> TestW<LogNormalT>::p = {.mu = 0.5, .sigma = 0.5};
+
+using LogisticT = HostApiTest<LogisticDistParams<double>, double, 16, 1>;
+template <>
+LogisticDistParams<double> TestW<LogisticT>::p = {.mu = 0.2, .scale = 0.3};
+
+using ExponentialT = HostApiTest<ExponentialDistParams<double>, double, 16, 1>;
+template <>
+ExponentialDistParams<double> TestW<ExponentialT>::p = {.lambda = 1.6};
+
+using RayleighT = HostApiTest<RayleighDistParams<double>, double, 16, 1>;
+template <>
+RayleighDistParams<double> TestW<RayleighT>::p = {.sigma = 1.6};
+
+using LaplaceT = HostApiTest<LaplaceDistParams<double>, double, 16, 1>;
+template <>
+LaplaceDistParams<double> TestW<LaplaceT>::p = {.mu = 0.2, .scale = 0.3};
+
+using TypeList = testing::Types<InvariantT,
+                                UniformT,
+                                UniformInt32T,
+                                UniformInt64T,
+                                NormalT,
+                                NormalIntT,
+                                BernoulliT,
+                                ScaledBernoulliT,
+                                GumbelT,
+                                LogisticT,
+                                ExponentialT,
+                                RayleighT,
+                                LaplaceT>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Rng, TestW, TypeList);
+
+}  // namespace random
+}  // namespace raft
diff --git a/cpp/test/util/integer_utils.cu b/cpp/test/util/integer_utils.cu
new file mode 100644
index 0000000000..7ea8d9528d
--- /dev/null
+++ b/cpp/test/util/integer_utils.cu
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include <gtest/gtest.h>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/integer_utils.hpp>
+#include <rmm/device_scalar.hpp>
+
+namespace raft {
+namespace util {
+
+struct MulInputs {
+  uint64_t expected_high;
+  uint64_t expected_low;
+  uint64_t operand_1;
+  uint64_t operand_2;
+};
+
+__global__ void mul64_test_kernel(uint64_t* result_high,
+                                  uint64_t* result_low,
+                                  uint64_t* swapped_result_high,
+                                  uint64_t* swapped_result_low,
+                                  const uint64_t op1,
+                                  const uint64_t op2)
+{
+  using raft::wmul_64bit;
+  wmul_64bit(*result_high, *result_low, op1, op2);
+  wmul_64bit(*swapped_result_high, *swapped_result_low, op2, op1);
+}
+
+class Multiplication64bit : public testing::TestWithParam<MulInputs> {
+ protected:
+  Multiplication64bit()
+    : stream(resource::get_cuda_stream(handle)),
+      d_result_high(stream),
+      d_result_low(stream),
+      d_swapped_result_high(stream),
+      d_swapped_result_low(stream)
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    using raft::wmul_64bit;
+    params = testing::TestWithParam<MulInputs>::GetParam();
+    wmul_64bit(result_high, result_low, params.operand_1, params.operand_2);
+    wmul_64bit(swapped_result_high, swapped_result_low, params.operand_2, params.operand_1);
+
+    mul64_test_kernel<<<1, 1, 0, stream>>>(d_result_high.data(),
+                                           d_result_low.data(),
+                                           d_swapped_result_high.data(),
+                                           d_swapped_result_low.data(),
+                                           params.operand_1,
+                                           params.operand_2);
+  }
+
+  raft::resources handle;
+  cudaStream_t stream;
+
+  rmm::device_scalar<uint64_t> d_result_high;
+  rmm::device_scalar<uint64_t> d_result_low;
+  rmm::device_scalar<uint64_t> d_swapped_result_high;
+  rmm::device_scalar<uint64_t> d_swapped_result_low;
+
+  MulInputs params;
+
+  uint64_t result_high;
+  uint64_t result_low;
+  uint64_t swapped_result_high;
+  uint64_t swapped_result_low;
+};
+
+const std::vector<MulInputs> inputs = {
+  {0ULL, 0ULL, 0ULL, 0ULL},
+  {0ULL, 0ULL, UINT64_MAX, 0ULL},
+  {0ULL, UINT64_MAX, UINT64_MAX, 1ULL},
+  {UINT64_MAX - 1, 1ULL, UINT64_MAX, UINT64_MAX},
+  {0x10759F98370FEC6EULL, 0xD5349806F735F69CULL, 0x1D6F160410C23D03ULL, 0x8F27C29767468634ULL},
+  {0xAF72C5B915A5ABDEULL >> 1, 0xAF72C5B915A5ABDEULL << 63, 0xAF72C5B915A5ABDEULL, 1ULL << 63},
+  {0xCA82AAEB81C01931ULL >> (64 - 23),
+   0xCA82AAEB81C01931ULL << 23,
+   0xCA82AAEB81C01931ULL,
+   1ULL << 23}};
+
+TEST_P(Multiplication64bit, Result)
+{
+  ASSERT_EQ(params.expected_high, d_result_high.value(stream));
+  ASSERT_EQ(params.expected_low, d_result_low.value(stream));
+  ASSERT_EQ(params.expected_high, d_swapped_result_high.value(stream));
+  ASSERT_EQ(params.expected_low, d_swapped_result_low.value(stream));
+
+  ASSERT_EQ(params.expected_high, result_high);
+  ASSERT_EQ(params.expected_low, result_low);
+  ASSERT_EQ(params.expected_high, swapped_result_high);
+  ASSERT_EQ(params.expected_low, swapped_result_low);
+}
+
+INSTANTIATE_TEST_CASE_P(Mul64bit, Multiplication64bit, testing::ValuesIn(inputs));
+
+};  // end of namespace util
+};  // end of namespace raft