Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update MurmurHash3_x64_128 to use the cuco equivalent implementation #17457

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 24 additions & 139 deletions cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,211 +15,96 @@
*/
#pragma once

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/strings/string_view.cuh>

#include <thrust/pair.h>
#include <cuco/hash_functions.cuh>
#include <cuda/std/array>
#include <cuda/std/cstddef>

namespace cudf::hashing::detail {

// MurmurHash3_x64_128 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
template <typename Key>
struct MurmurHash3_x64_128 {
using result_type = thrust::pair<uint64_t, uint64_t>;
using result_type = cuda::std::array<uint64_t, 2>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any benefit from this change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cuda::std::array is the proper type.

In the original implementation, the output was an array of two elements. Since cuda::std::array was not available when the hasher was introduced into libcudf, we used thrust::pair instead. Looking ahead, if we adopt more 128-bit hashers where the return type could consist of four 32-bit integers, cuda::std::array would still be a suitable choice, whereas thrust::pair would not.


constexpr MurmurHash3_x64_128() = default;
constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {}

__device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const
CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED)
: _impl{seed}
{
// Read a 4-byte value from the data pointer as individual bytes for safe
// unaligned access (very likely for string types).
auto block = reinterpret_cast<uint8_t const*>(data + offset);
return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
}

__device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const
{
uint64_t result = getblock32(data, offset + 4);
result = result << 32;
return result | getblock32(data, offset);
}
__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

__device__ inline uint64_t fmix64(uint64_t k) const
__device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
std::uint64_t size) const
{
k ^= k >> 33;
k *= 0xff51afd7ed558ccdUL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53UL;
k ^= k >> 33;
return k;
return this->_impl.compute_hash(bytes, size);
}

result_type __device__ inline operator()(Key const& key) const { return compute(key); }

private:
template <typename T>
result_type __device__ inline compute(T const& key) const
{
return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
}

result_type __device__ inline compute_remaining_bytes(std::byte const* data,
cudf::size_type len,
cudf::size_type tail_offset,
result_type h) const
{
// Process remaining bytes that do not fill a 8-byte chunk.
uint64_t k1 = 0;
uint64_t k2 = 0;
auto const tail = reinterpret_cast<uint8_t const*>(data) + tail_offset;
switch (len & (BLOCK_SIZE - 1)) {
case 15: k2 ^= static_cast<uint64_t>(tail[14]) << 48;
case 14: k2 ^= static_cast<uint64_t>(tail[13]) << 40;
case 13: k2 ^= static_cast<uint64_t>(tail[12]) << 32;
case 12: k2 ^= static_cast<uint64_t>(tail[11]) << 24;
case 11: k2 ^= static_cast<uint64_t>(tail[10]) << 16;
case 10: k2 ^= static_cast<uint64_t>(tail[9]) << 8;
case 9:
k2 ^= static_cast<uint64_t>(tail[8]) << 0;
k2 *= c2;
k2 = rotate_bits_left(k2, 33);
k2 *= c1;
h.second ^= k2;

case 8: k1 ^= static_cast<uint64_t>(tail[7]) << 56;
case 7: k1 ^= static_cast<uint64_t>(tail[6]) << 48;
case 6: k1 ^= static_cast<uint64_t>(tail[5]) << 40;
case 5: k1 ^= static_cast<uint64_t>(tail[4]) << 32;
case 4: k1 ^= static_cast<uint64_t>(tail[3]) << 24;
case 3: k1 ^= static_cast<uint64_t>(tail[2]) << 16;
case 2: k1 ^= static_cast<uint64_t>(tail[1]) << 8;
case 1:
k1 ^= static_cast<uint64_t>(tail[0]) << 0;
k1 *= c1;
k1 = rotate_bits_left(k1, 31);
k1 *= c2;
h.first ^= k1;
};
return h;
}

result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
__device__ constexpr result_type compute(T const& key) const
{
auto const nblocks = len / BLOCK_SIZE;
uint64_t h1 = m_seed;
uint64_t h2 = m_seed;

// Process all four-byte chunks.
for (cudf::size_type i = 0; i < nblocks; i++) {
uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes
uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes

k1 *= c1;
k1 = rotate_bits_left(k1, 31);
k1 *= c2;

h1 ^= k1;
h1 = rotate_bits_left(h1, 27);
h1 += h2;
h1 = h1 * 5 + 0x52dce729;

k2 *= c2;
k2 = rotate_bits_left(k2, 33);
k2 *= c1;

h2 ^= k2;
h2 = rotate_bits_left(h2, 31);
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;
}

thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2});

// Finalize hash.
h1 ^= len;
h2 ^= len;

h1 += h2;
h2 += h1;

h1 = fmix64(h1);
h2 = fmix64(h2);

h1 += h2;
h2 += h1;

return {h1, h2};
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
}

private:
uint64_t m_seed{};
static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes

static constexpr uint64_t c1 = 0x87c37b91114253d5UL;
static constexpr uint64_t c2 = 0x4cf5ad432745937fUL;
cuco::murmurhash3_x64_128<Key> _impl;
};

template <>
MurmurHash3_x64_128<bool>::result_type __device__ inline MurmurHash3_x64_128<bool>::operator()(
bool const& key) const
{
return compute<uint8_t>(key);
return this->compute<uint8_t>(key);
}

template <>
MurmurHash3_x64_128<float>::result_type __device__ inline MurmurHash3_x64_128<float>::operator()(
float const& key) const
{
return compute(normalize_nans(key));
return this->compute(normalize_nans(key));
}

template <>
MurmurHash3_x64_128<double>::result_type __device__ inline MurmurHash3_x64_128<double>::operator()(
double const& key) const
{
return compute(normalize_nans(key));
return this->compute(normalize_nans(key));
}

template <>
MurmurHash3_x64_128<cudf::string_view>::result_type
__device__ inline MurmurHash3_x64_128<cudf::string_view>::operator()(
cudf::string_view const& key) const
{
auto const data = reinterpret_cast<std::byte const*>(key.data());
auto const len = key.size_bytes();
return compute_bytes(data, len);
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
key.size_bytes());
}

template <>
MurmurHash3_x64_128<numeric::decimal32>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal32>::operator()(
numeric::decimal32 const& key) const
{
return compute(key.value());
return this->compute(key.value());
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
}

template <>
MurmurHash3_x64_128<numeric::decimal64>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal64>::operator()(
numeric::decimal64 const& key) const
{
return compute(key.value());
return this->compute(key.value());
}

template <>
MurmurHash3_x64_128<numeric::decimal128>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal128>::operator()(
numeric::decimal128 const& key) const
{
return compute(key.value());
return this->compute(key.value());
}

} // namespace cudf::hashing::detail
2 changes: 1 addition & 1 deletion cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ template <typename Key>
struct MurmurHash3_x86_32 {
using result_type = hash_value_type;

__host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
: _impl{seed}
{
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/hashing/detail/xxhash_64.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <typename Key>
struct XXHash_64 {
using result_type = std::uint64_t;

__host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

Expand Down
17 changes: 9 additions & 8 deletions cpp/src/hash/murmurhash3_x64_128.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/std/array>
#include <thrust/for_each.h>

namespace cudf {
namespace hashing {
namespace detail {
namespace {

using hash_value_type = thrust::pair<uint64_t, uint64_t>;
using hash_value_type = cuda::std::array<uint64_t, 2>;

/**
* @brief Computes the hash value of a row in the given table.
Expand All @@ -58,16 +59,16 @@ class murmur_device_row_hasher {
*/
__device__ void operator()(size_type row_index) const noexcept
{
auto h = cudf::detail::accumulate(
auto const h = cudf::detail::accumulate(
_input.begin(),
_input.end(),
hash_value_type{_seed, 0},
[row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
});
_output1[row_index] = h.first;
_output2[row_index] = h.second;
_output1[row_index] = h[0];
_output2[row_index] = h[1];
}

/**
Expand All @@ -78,13 +79,13 @@ class murmur_device_row_hasher {
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view const& col,
size_type row_index,
Nullate const _check_nulls,
hash_value_type const _seed) const noexcept
Nullate const check_nulls,
hash_value_type const seed) const noexcept
{
if (_check_nulls && col.is_null(row_index)) {
if (check_nulls && col.is_null(row_index)) {
return {std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max()};
}
auto const hasher = MurmurHash3_x64_128<T>{_seed.first};
auto const hasher = MurmurHash3_x64_128<T>{seed[0]};
return hasher(col.element<T>(row_index));
}

Expand Down
Loading