From b67c0a97d16ec3c9d0abf825ad9755013b24ebab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 Dec 2024 16:04:59 -0800 Subject: [PATCH] Update MurmurHash3_x64_128 to use the cuco equivalent implementation (#17457) This PR modifies MurmurHash3_x64_128 to utilize the cuco equivalent implementation, eliminating duplication. Authors: - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17457 --- .../hashing/detail/murmurhash3_x64_128.cuh | 163 +++--------------- .../hashing/detail/murmurhash3_x86_32.cuh | 2 +- cpp/include/cudf/hashing/detail/xxhash_64.cuh | 2 +- cpp/src/hash/murmurhash3_x64_128.cu | 17 +- 4 files changed, 35 insertions(+), 149 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index 5e88b905023..31390aa3edf 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -15,177 +15,63 @@ */ #pragma once +#include +#include #include #include -#include +#include +#include +#include namespace cudf::hashing::detail { -// MurmurHash3_x64_128 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. template struct MurmurHash3_x64_128 { - using result_type = thrust::pair; + using result_type = cuda::std::array; - constexpr MurmurHash3_x64_128() = default; - constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED) + : _impl{seed} { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } - __device__ inline uint64_t fmix64(uint64_t k) const + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdUL; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53UL; - k ^= k >> 33; - return k; + return this->_impl.compute_hash(bytes, size); } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - + private: template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a 8-byte chunk. - uint64_t k1 = 0; - uint64_t k2 = 0; - auto const tail = reinterpret_cast(data) + tail_offset; - switch (len & (BLOCK_SIZE - 1)) { - case 15: k2 ^= static_cast(tail[14]) << 48; - case 14: k2 ^= static_cast(tail[13]) << 40; - case 13: k2 ^= static_cast(tail[12]) << 32; - case 12: k2 ^= static_cast(tail[11]) << 24; - case 11: k2 ^= static_cast(tail[10]) << 16; - case 10: k2 ^= static_cast(tail[9]) << 8; - case 9: - k2 ^= static_cast(tail[8]) << 0; - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - h.second ^= k2; - - case 8: k1 ^= static_cast(tail[7]) << 56; - case 7: k1 ^= static_cast(tail[6]) << 48; - case 6: k1 ^= static_cast(tail[5]) << 40; - case 5: k1 ^= static_cast(tail[4]) << 32; - case 4: k1 ^= static_cast(tail[3]) << 24; - case 3: k1 ^= static_cast(tail[2]) << 16; - case 2: k1 ^= static_cast(tail[1]) << 8; - case 1: - k1 ^= static_cast(tail[0]) << 0; - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - h.first ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + __device__ constexpr result_type compute(T const& key) const { - auto const nblocks = len / BLOCK_SIZE; - uint64_t h1 = m_seed; - uint64_t h2 = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes - uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes - - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - - h1 ^= k1; - h1 = rotate_bits_left(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - - h2 ^= k2; - h2 = rotate_bits_left(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); - - // Finalize hash. - h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return {h1, h2}; + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); } - private: - uint64_t m_seed{}; - static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes - - static constexpr uint64_t c1 = 0x87c37b91114253d5UL; - static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; + cuco::murmurhash3_x64_128 _impl; }; template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( bool const& key) const { - return compute(key); + return this->compute(key); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( float const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( double const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> @@ -193,9 +79,8 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( cudf::string_view const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); } template <> @@ -203,7 +88,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal32 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -211,7 +96,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal64 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -219,7 +104,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal128 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index 38a7d927b9c..e0c7ce840d7 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -33,7 +33,7 @@ template struct MurmurHash3_x86_32 { using result_type = hash_value_type; - __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) + CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} { } diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh index 7d72349e340..d77d040b365 100644 --- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -31,7 +31,7 @@ template struct XXHash_64 { using result_type = std::uint64_t; - __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu index 090bd92af8c..43df7f325ac 100644 --- a/cpp/src/hash/murmurhash3_x64_128.cu +++ b/cpp/src/hash/murmurhash3_x64_128.cu @@ -24,6 +24,7 @@ #include #include +#include #include namespace cudf { @@ -31,7 +32,7 @@ namespace hashing { namespace detail { namespace { -using hash_value_type = thrust::pair; +using hash_value_type = cuda::std::array; /** * @brief Computes the hash value of a row in the given table. @@ -58,7 +59,7 @@ class murmur_device_row_hasher { */ __device__ void operator()(size_type row_index) const noexcept { - auto h = cudf::detail::accumulate( + auto const h = cudf::detail::accumulate( _input.begin(), _input.end(), hash_value_type{_seed, 0}, @@ -66,8 +67,8 @@ class murmur_device_row_hasher { return cudf::type_dispatcher( column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); }); - _output1[row_index] = h.first; - _output2[row_index] = h.second; + _output1[row_index] = h[0]; + _output2[row_index] = h[1]; } /** @@ -78,13 +79,13 @@ class murmur_device_row_hasher { template ())> __device__ hash_value_type operator()(column_device_view const& col, size_type row_index, - Nullate const _check_nulls, - hash_value_type const _seed) const noexcept + Nullate const check_nulls, + hash_value_type const seed) const noexcept { - if (_check_nulls && col.is_null(row_index)) { + if (check_nulls && col.is_null(row_index)) { return {std::numeric_limits::max(), std::numeric_limits::max()}; } - auto const hasher = MurmurHash3_x64_128{_seed.first}; + auto const hasher = MurmurHash3_x64_128{seed[0]}; return hasher(col.element(row_index)); }