diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh index 7f994fb07f..d6999d9236 100644 --- a/cpp/include/raft/random/detail/rng_device.cuh +++ b/cpp/include/raft/random/detail/rng_device.cuh @@ -18,6 +18,7 @@ #include #include +#include #include @@ -139,9 +140,9 @@ struct SamplingParams { }; template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +HDI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); + constexpr Type twoPi = Type(2.0) * Type(3.141592653589793); constexpr Type minus2 = -Type(2.0); Type R = raft::sqrt(minus2 * raft::log(val1)); Type theta = twoPi * val2; @@ -152,27 +153,27 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type } template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +HDI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) { box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } template -DI void custom_next(GenType& gen, - OutType* val, - InvariantDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + InvariantDistParams params, + LenType idx = 0, + LenType stride = 0) { *val = params.const_val; } template -DI void custom_next(GenType& gen, - OutType* val, - UniformDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + UniformDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res; gen.next(res); @@ -180,11 +181,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - UniformIntDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + UniformIntDistParams params, + LenType idx = 0, + LenType stride = 0) { uint32_t x = 0; uint32_t s = params.diff; @@ -203,32 +204,31 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - UniformIntDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + UniformIntDistParams params, + LenType idx = 0, + LenType stride = 0) { + using raft::wmul_64bit; uint64_t x = 0; gen.next(x); uint64_t s = params.diff; uint64_t m_lo, m_hi; // m = x * s; - asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s)); - asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s)); + wmul_64bit(m_hi, m_lo, x, s); if (m_lo < s) { uint64_t t = (-s) % s; // (2^64 - s) mod s while (m_lo < t) { gen.next(x); - asm("mul.hi.u64 %0, %1, %2;" : "=l"(m_hi) : "l"(x), "l"(s)); - asm("mul.lo.u64 %0, %1, %2;" : "=l"(m_lo) : "l"(x), "l"(s)); + wmul_64bit(m_hi, m_lo, x, s); } } *val = OutType(m_hi) + params.start; } template -DI void custom_next( +HDI void custom_next( GenType& gen, OutType* val, NormalDistParams params, LenType idx = 0, LenType stride = 0) { OutType res1, res2; @@ -245,21 +245,18 @@ DI void custom_next( } template -DI void custom_next(GenType& gen, - IntType* val, - NormalIntDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + IntType* val, + NormalIntDistParams params, + LenType idx = 0, + LenType stride = 0) { - IntType res1_int, res2_int; - + double res1, res2; do { - gen.next(res1_int); - } while (res1_int == 0); + gen.next(res1); + } while (res1 == double(0.0)); - gen.next(res2_int); - double res1 = static_cast(res1_int); - double res2 = static_cast(res2_int); + gen.next(res2); double mu = static_cast(params.mu); double sigma = static_cast(params.sigma); box_muller_transform(res1, res2, sigma, mu); @@ -268,11 +265,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - NormalTableDistParams params, - LenType idx, - LenType stride) +HDI void custom_next(GenType& gen, + OutType* val, + NormalTableDistParams params, + LenType idx, + LenType stride) { OutType res1, res2; @@ -293,7 +290,7 @@ DI void custom_next(GenType& gen, } template -DI void custom_next( +HDI void custom_next( GenType& gen, OutType* val, BernoulliDistParams params, LenType idx = 0, LenType stride = 0) { Type res = 0; @@ -302,11 +299,11 @@ DI void custom_next( } template -DI void custom_next(GenType& gen, - OutType* val, - ScaledBernoulliDistParams params, - LenType idx, - LenType stride) +HDI void custom_next(GenType& gen, + OutType* val, + ScaledBernoulliDistParams params, + LenType idx, + LenType stride) { OutType res = 0; gen.next(res); @@ -314,7 +311,7 @@ DI void custom_next(GenType& gen, } template -DI void custom_next( +HDI void custom_next( GenType& gen, OutType* val, GumbelDistParams params, LenType idx = 0, LenType stride = 0) { OutType res = 0; @@ -327,11 +324,11 @@ DI void custom_next( } template -DI void custom_next(GenType& gen, - OutType* val, - LogNormalDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + LogNormalDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res1 = 0, res2 = 0; do { @@ -345,11 +342,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - LogisticDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + LogisticDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res; @@ -362,11 +359,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - ExponentialDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + ExponentialDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res; gen.next(res); @@ -375,11 +372,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - RayleighDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + RayleighDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res; gen.next(res); @@ -390,11 +387,11 @@ DI void custom_next(GenType& gen, } template -DI void custom_next(GenType& gen, - OutType* val, - LaplaceDistParams params, - LenType idx = 0, - LenType stride = 0) +HDI void custom_next(GenType& gen, + OutType* val, + LaplaceDistParams params, + LenType idx = 0, + LenType stride = 0) { OutType res, out; @@ -417,7 +414,7 @@ DI void custom_next(GenType& gen, } template -DI void custom_next( +HDI void custom_next( GenType& gen, OutType* val, SamplingParams params, LenType idx, LenType stride) { OutType res; @@ -536,36 +533,33 @@ struct PCGenerator { static constexpr auto GEN_TYPE = GeneratorType::GenPC; /** - * @brief ctor. Initializes the state for RNG. This code is derived from PCG basic code - * @param seed the seed (can be same across all threads). Same as PCG's initstate - * @param subsequence is same as PCG's initseq - * @param offset unused + * @brief ctor. Initializes the PCG + * @param rng_state is the generator state used for initializing the generator + * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences + * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a + * unique set of random numbers. This can be achieved by initializing the generator with same + * rng_state for all the threads and distinct values for subsequence. */ - DI PCGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + HDI PCGenerator(const DeviceState& rng_state, const uint64_t subsequence) { - pcg_state = uint64_t(0); - inc = (subsequence << 1u) | 1u; - uint32_t discard; - next(discard); - pcg_state += seed; - next(discard); - skipahead(offset); + _init_pcg(rng_state.seed, rng_state.base_subsequence + subsequence, subsequence); } - DI PCGenerator(const DeviceState& rng_state, const uint64_t subsequence) + /** + * @brief ctor. This is lower level constructor for PCG + * This code is derived from PCG basic code + * @param seed A 64-bit seed for the generator + * @param subsequence The id of subsequence that should be generated [0, 2^64-1] + * @param offset Initial `offset` number of items are skipped from the subsequence + */ + HDI PCGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - pcg_state = uint64_t(0); - inc = ((rng_state.base_subsequence + subsequence) << 1u) | 1u; - uint32_t discard; - next(discard); - pcg_state += rng_state.seed; - next(discard); - skipahead(subsequence); + _init_pcg(seed, subsequence, offset); } // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf - DI void skipahead(uint64_t offset) + HDI void skipahead(uint64_t offset) { uint64_t G = 1; uint64_t h = 6364136223846793005ULL; @@ -588,7 +582,7 @@ struct PCGenerator { * @brief This code is derived from PCG basic code * @{ */ - DI uint32_t next_u32() + HDI uint32_t next_u32() { uint32_t ret; uint64_t oldstate = pcg_state; @@ -598,7 +592,7 @@ struct PCGenerator { ret = (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); return ret; } - DI uint64_t next_u64() + HDI uint64_t next_u64() { uint64_t ret; uint32_t a, b; @@ -608,7 +602,7 @@ struct PCGenerator { return ret; } - DI int32_t next_i32() + HDI int32_t next_i32() { int32_t ret; uint32_t val; @@ -617,7 +611,7 @@ struct PCGenerator { return ret; } - DI int64_t next_i64() + HDI int64_t next_i64() { int64_t ret; uint64_t val; @@ -626,7 +620,7 @@ struct PCGenerator { return ret; } - DI float next_float() + HDI float next_float() { float ret; uint32_t val = next_u32() >> 8; @@ -634,7 +628,7 @@ struct PCGenerator { return ret; } - DI double next_double() + HDI double next_double() { double ret; uint64_t val = next_u64() >> 11; @@ -642,17 +636,27 @@ struct PCGenerator { return ret; } - DI void next(uint32_t& ret) { ret = next_u32(); } - DI void next(uint64_t& ret) { ret = next_u64(); } - DI void next(int32_t& ret) { ret = next_i32(); } - DI void next(int64_t& ret) { ret = next_i64(); } + HDI void next(uint32_t& ret) { ret = next_u32(); } + HDI void next(uint64_t& ret) { ret = next_u64(); } + HDI void next(int32_t& ret) { ret = next_i32(); } + HDI void next(int64_t& ret) { ret = next_i64(); } - DI void next(float& ret) { ret = next_float(); } - DI void next(double& ret) { ret = next_double(); } + HDI void next(float& ret) { ret = next_float(); } + HDI void next(double& ret) { ret = next_double(); } /** @} */ private: + HDI void _init_pcg(uint64_t seed, uint64_t subsequence, uint64_t offset) + { + pcg_state = uint64_t(0); + inc = (subsequence << 1u) | 1u; + uint32_t discard; + next(discard); + pcg_state += seed; + next(discard); + skipahead(offset); + } uint64_t pcg_state; uint64_t inc; }; diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp index 6faab5381c..c14d4327c3 100644 --- a/cpp/include/raft/util/integer_utils.hpp +++ b/cpp/include/raft/util/integer_utils.hpp @@ -24,6 +24,7 @@ */ #include +#include #include #include @@ -199,4 +200,38 @@ struct is_narrowing()})>> : template inline constexpr bool is_narrowing_v = is_narrowing::value; // NOLINT +/** Wide multiplication of two unsigned 64-bit integers */ +_RAFT_HOST_DEVICE inline void wmul_64bit(uint64_t& res_hi, uint64_t& res_lo, uint64_t a, uint64_t b) +{ +#ifdef __CUDA_ARCH__ + asm("mul.hi.u64 %0, %1, %2;" : "=l"(res_hi) : "l"(a), "l"(b)); + asm("mul.lo.u64 %0, %1, %2;" : "=l"(res_lo) : "l"(a), "l"(b)); +#else + uint32_t a_hi, a_lo, b_hi, b_lo; + + a_hi = uint32_t(a >> 32); + a_lo = uint32_t(a & uint64_t(0x00000000FFFFFFFF)); + b_hi = uint32_t(b >> 32); + b_lo = uint32_t(b & uint64_t(0x00000000FFFFFFFF)); + + uint64_t t0 = uint64_t(a_lo) * uint64_t(b_lo); + uint64_t t1 = uint64_t(a_hi) * uint64_t(b_lo); + uint64_t t2 = uint64_t(a_lo) * uint64_t(b_hi); + uint64_t t3 = uint64_t(a_hi) * uint64_t(b_hi); + + uint64_t carry = 0, trial = 0; + + res_lo = t0; + trial = res_lo + (t1 << 32); + if (trial < res_lo) carry++; + res_lo = trial; + trial = res_lo + (t2 << 32); + if (trial < res_lo) carry++; + res_lo = trial; + + // No need to worry about carry in this addition + res_hi = (t1 >> 32) + (t2 >> 32) + t3 + carry; +#endif +} + } // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index deefd9644a..db4c59c807 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -269,6 +269,7 @@ if(BUILD_TESTS) test/random/make_blobs.cu test/random/make_regression.cu test/random/multi_variable_gaussian.cu + test/random/rng_pcg_host_api.cu test/random/permute.cu test/random/rng.cu test/random/rng_discrete.cu @@ -425,6 +426,7 @@ if(BUILD_TESTS) test/util/cudart_utils.cpp test/util/device_atomics.cu test/util/integer_utils.cpp + test/util/integer_utils.cu test/util/pow2_utils.cu test/util/reduction.cu ) diff --git a/cpp/test/random/rng_pcg_host_api.cu b/cpp/test/random/rng_pcg_host_api.cu new file mode 100644 index 0000000000..a0263f0586 --- /dev/null +++ b/cpp/test/random/rng_pcg_host_api.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +namespace raft { +namespace random { + +// CPT - Calls Per Thread, How many calls to custom_next is made by a single thread +// IPC - Items Per Call, How many items are returned by a single call to custom_next (usually IPC = +// 1 or 2) +template +__host__ __device__ void single_thread_fill(DType* buffer, + DeviceState r, + ParamType params, + const size_t total_threads, + const size_t len, + const size_t tid) +{ + PCGenerator gen(r, tid); + + for (size_t i = 0; i < CPT; i++) { + DType val[IPC]; + size_t index = (tid * CPT * IPC) + i * IPC; + custom_next(gen, val, params, index, total_threads); + for (int j = 0; j < IPC; j++) { + if (index + j < len) { buffer[index + j] = val[j]; } + } + } +} + +template +__global__ void pcg_device_kernel(DType* buffer, + DeviceState r, + ParamType params, + const size_t total_threads, + const size_t len) +{ + int tid = int(blockIdx.x) * blockDim.x + threadIdx.x; + + single_thread_fill(buffer, r, params, total_threads, len, tid); +} + +template +class HostApiTest { + public: + HostApiTest() : stream(resource::get_cuda_stream(handle)), d_buffer(0, stream) + { + len = total_threads * CPT * IPC; + d_buffer.resize(len, stream); + h_buffer.resize(len); + } + void FillBuffers(uint64_t seed) + { + RngState r(seed, GenPC); + DeviceState d_state(r); + + pcg_device_kernel<<>>( + d_buffer.data(), d_state, dist_params, total_threads, len); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + for (size_t tid = 0; tid < len; tid++) { + single_thread_fill( + h_buffer.data(), d_state, dist_params, total_threads, len, tid); + } + } + void SetParams(ParamType _dist_params) { dist_params = _dist_params; } + + void test() + { + ASSERT_TRUE(devArrMatchHost( + h_buffer.data(), d_buffer.data(), len, raft::CompareApprox(1e-5), stream)); + } + ParamType dist_params; + raft::resources handle; + cudaStream_t stream; + + static const int n_blocks = 128; + static const int n_threads = 64; + static const size_t total_threads = size_t(n_blocks) * n_threads; + + size_t len; + rmm::device_uvector d_buffer; + std::vector h_buffer; +}; + +// This Wrapper class is needed because gtest typed test allows single type per class +template +class TestW : public testing::Test { + protected: + void SetUp() override + { + test_obj.SetParams(p); + test_obj.FillBuffers(seed); + } + + public: + void TestFillBuffer() { test_obj.test(); } + T test_obj; + using ParamType = decltype(T::dist_params); + static ParamType p; + const uint64_t seed = 42; +}; + +TYPED_TEST_SUITE_P(TestW); + +TYPED_TEST_P(TestW, host_api_test) { this->TestFillBuffer(); } + +REGISTER_TYPED_TEST_SUITE_P(TestW, host_api_test); + +using InvariantT = HostApiTest, int, 16, 1>; +template <> +InvariantDistParams TestW::p = {.const_val = 123456}; + +using UniformT = HostApiTest, double, 16, 1>; +template <> +UniformDistParams TestW::p = {.start = 0.0, .end = 1.0}; + +using UniformInt32T = HostApiTest, uint32_t, 16, 1>; +template <> +UniformIntDistParams TestW::p = { + .start = 0, .end = 100000, .diff = 100000}; + +using UniformInt64T = HostApiTest, uint64_t, 16, 1>; +template <> +UniformIntDistParams TestW::p = { + .start = 0, .end = 100000, .diff = 100000}; + +using NormalT = HostApiTest, double, 16, 2>; +template <> +NormalDistParams TestW::p = {.mu = 0.5, .sigma = 0.5}; + +using NormalIntT = HostApiTest, uint32_t, 16, 2>; +template <> +NormalIntDistParams TestW::p = {.mu = 10000000, .sigma = 10000}; + +using BernoulliT = HostApiTest, double, 16, 1>; +template <> +BernoulliDistParams TestW::p = {.prob = 0.7}; + +using ScaledBernoulliT = HostApiTest, double, 16, 1>; +template <> +ScaledBernoulliDistParams TestW::p = {.prob = 0.7, .scale = 0.5}; + +using GumbelT = HostApiTest, double, 16, 1>; +template <> +GumbelDistParams TestW::p = {.mu = 0.7, .beta = 0.5}; + +using LogNormalT = HostApiTest, double, 16, 2>; +template <> +LogNormalDistParams TestW::p = {.mu = 0.5, .sigma = 0.5}; + +using LogisticT = HostApiTest, double, 16, 1>; +template <> +LogisticDistParams TestW::p = {.mu = 0.2, .scale = 0.3}; + +using ExponentialT = HostApiTest, double, 16, 1>; +template <> +ExponentialDistParams TestW::p = {.lambda = 1.6}; + +using RayleighT = HostApiTest, double, 16, 1>; +template <> +RayleighDistParams TestW::p = {.sigma = 1.6}; + +using LaplaceT = HostApiTest, double, 16, 1>; +template <> +LaplaceDistParams TestW::p = {.mu = 0.2, .scale = 0.3}; + +using TypeList = testing::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Rng, TestW, TypeList); + +} // namespace random +} // namespace raft diff --git a/cpp/test/util/integer_utils.cu b/cpp/test/util/integer_utils.cu new file mode 100644 index 0000000000..7ea8d9528d --- /dev/null +++ b/cpp/test/util/integer_utils.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include +#include +#include +#include + +namespace raft { +namespace util { + +struct MulInputs { + uint64_t expected_high; + uint64_t expected_low; + uint64_t operand_1; + uint64_t operand_2; +}; + +__global__ void mul64_test_kernel(uint64_t* result_high, + uint64_t* result_low, + uint64_t* swapped_result_high, + uint64_t* swapped_result_low, + const uint64_t op1, + const uint64_t op2) +{ + using raft::wmul_64bit; + wmul_64bit(*result_high, *result_low, op1, op2); + wmul_64bit(*swapped_result_high, *swapped_result_low, op2, op1); +} + +class Multiplication64bit : public testing::TestWithParam { + protected: + Multiplication64bit() + : stream(resource::get_cuda_stream(handle)), + d_result_high(stream), + d_result_low(stream), + d_swapped_result_high(stream), + d_swapped_result_low(stream) + { + } + + protected: + void SetUp() override + { + using raft::wmul_64bit; + params = testing::TestWithParam::GetParam(); + wmul_64bit(result_high, result_low, params.operand_1, params.operand_2); + wmul_64bit(swapped_result_high, swapped_result_low, params.operand_2, params.operand_1); + + mul64_test_kernel<<<1, 1, 0, stream>>>(d_result_high.data(), + d_result_low.data(), + d_swapped_result_high.data(), + d_swapped_result_low.data(), + params.operand_1, + params.operand_2); + } + + raft::resources handle; + cudaStream_t stream; + + rmm::device_scalar d_result_high; + rmm::device_scalar d_result_low; + rmm::device_scalar d_swapped_result_high; + rmm::device_scalar d_swapped_result_low; + + MulInputs params; + + uint64_t result_high; + uint64_t result_low; + uint64_t swapped_result_high; + uint64_t swapped_result_low; +}; + +const std::vector inputs = { + {0ULL, 0ULL, 0ULL, 0ULL}, + {0ULL, 0ULL, UINT64_MAX, 0ULL}, + {0ULL, UINT64_MAX, UINT64_MAX, 1ULL}, + {UINT64_MAX - 1, 1ULL, UINT64_MAX, UINT64_MAX}, + {0x10759F98370FEC6EULL, 0xD5349806F735F69CULL, 0x1D6F160410C23D03ULL, 0x8F27C29767468634ULL}, + {0xAF72C5B915A5ABDEULL >> 1, 0xAF72C5B915A5ABDEULL << 63, 0xAF72C5B915A5ABDEULL, 1ULL << 63}, + {0xCA82AAEB81C01931ULL >> (64 - 23), + 0xCA82AAEB81C01931ULL << 23, + 0xCA82AAEB81C01931ULL, + 1ULL << 23}}; + +TEST_P(Multiplication64bit, Result) +{ + ASSERT_EQ(params.expected_high, d_result_high.value(stream)); + ASSERT_EQ(params.expected_low, d_result_low.value(stream)); + ASSERT_EQ(params.expected_high, d_swapped_result_high.value(stream)); + ASSERT_EQ(params.expected_low, d_swapped_result_low.value(stream)); + + ASSERT_EQ(params.expected_high, result_high); + ASSERT_EQ(params.expected_low, result_low); + ASSERT_EQ(params.expected_high, swapped_result_high); + ASSERT_EQ(params.expected_low, swapped_result_low); +} + +INSTANTIATE_TEST_CASE_P(Mul64bit, Multiplication64bit, testing::ValuesIn(inputs)); + +}; // end of namespace util +}; // end of namespace raft