Murmur3 hash kernel cleanup #10143

Merged 5 commits on Feb 7, 2022

Changes from 2 commits
216 changes: 82 additions & 134 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -105,6 +105,14 @@ struct MurmurHash3_32 {
return h;
}

@bdice (Contributor) commented on Jan 28, 2022:

Note to self for a future PR: Do we need MurmurHash3_32 to be a templated class? Currently the class takes a template parameter Key and has an operator()(Key key) with no template parameters, which calls a templated compute(T key). However, the way it's called in row_operators.cuh suggests we could instead have a plain (non-template) class with a templated operator(). That's how we typically do type dispatching, and it's reversed here for no clear reason. The calling code uses a type dispatch on element_hasher_with_seed.

(This would probably affect performance and/or compile time but I don't know if it would be better or worse.)
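
For reference, a minimal sketch of that alternative shape (all names here are illustrative rather than the cudf API, and compute_bytes is a simplified placeholder, not real MurmurHash3):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical sketch (names are illustrative, not the cudf API): a plain,
// non-template functor whose operator() is templated, so the element type is
// supplied by the type dispatcher at the call site.
struct murmur_hasher_32 {
  constexpr explicit murmur_hasher_32(uint32_t seed = 0) : m_seed{seed} {}

  template <typename T>
  __device__ uint32_t operator()(T const& key) const
  {
    // Delegate to a byte-oriented routine, as this PR does.
    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
  }

 private:
  // Placeholder body standing in for the real MurmurHash3 routine in this file.
  __device__ uint32_t compute_bytes(std::byte const* data, int len) const
  {
    uint32_t h = m_seed;
    for (int i = 0; i < len; ++i) { h = h * 31u + std::to_integer<uint8_t>(data[i]); }
    return h ^ static_cast<uint32_t>(len);
  }

  uint32_t m_seed;
};
```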

[[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
cudf::size_type offset) const
{
// Individual byte reads for unaligned accesses (very likely for strings)
auto const q = reinterpret_cast<uint8_t const*>(data + offset);
return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
}

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
@@ -122,7 +130,7 @@ struct MurmurHash3_32 {
*
* @returns A hash value that intelligently combines the lhs and rhs hash values
*/
__device__ inline result_type hash_combine(result_type lhs, result_type rhs)
[[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs)
{
result_type combined{lhs};

@@ -131,7 +139,7 @@
return combined;
}

result_type __device__ inline operator()(Key const& key) const { return compute(key); }
[[nodiscard]] result_type __device__ inline operator()(Key const& key) const
{
return compute(key);
}

// compute wrapper for floating point types
template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
@@ -147,44 +158,49 @@
}
}

template <typename TKey>
result_type __device__ inline compute(TKey const& key) const
template <typename T>
result_type __device__ inline compute(T const& key) const
{
constexpr int len = sizeof(argument_type);
uint8_t const* const data = reinterpret_cast<uint8_t const*>(&key);
constexpr int nblocks = len / 4;

uint32_t h1 = m_seed;
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
//----------
// body
uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
for (int i = -nblocks; i; i++) {
uint32_t k1 = blocks[i]; // getblock32(blocks,i);
return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
}

result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
{
constexpr cudf::size_type BLOCK_SIZE = 4;
cudf::size_type const nblocks = len / BLOCK_SIZE;
cudf::size_type const tail_offset = nblocks * BLOCK_SIZE;
result_type h1 = m_seed;
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
constexpr uint32_t c3 = 0xe6546b64;
constexpr uint32_t rot_c1 = 15;
constexpr uint32_t rot_c2 = 13;

// Process all four-byte chunks.
for (cudf::size_type i = 0; i < nblocks; i++) {
uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 = rotl32(k1, rot_c1);
@bdice (Contributor) commented on Jan 27, 2022:

Future PR: We might move the functions and magic values shared by MurmurHash3_32 and SparkMurmurHash3_32 (such as rotl32, fmix32, and getblock32) into a common base class and override only the Spark-specific bits in a derived class. CRTP might be an even better choice, as I did for the SHA-family functions (draft #9215); it just needs a bit of analysis to decide which way to go.
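
A rough CRTP sketch of what that could look like (all names and structure are hypothetical, not this PR's code): the shared block loop lives in the base class, and only the tail handling, the part where Spark diverges, is supplied by the derived class.

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical CRTP base: shared MurmurHash3 machinery lives here; the derived
// hasher supplies process_tail(), which is where Spark differs from the
// standard algorithm. Illustrative only, not the code in this PR.
template <typename Hasher>
struct murmur_hash_base {
  __device__ static uint32_t rotl32(uint32_t x, uint32_t r)
  {
    return (x << r) | (x >> (32 - r));
  }

  __device__ static uint32_t getblock32(std::byte const* data, int offset)
  {
    // Byte-wise reads to tolerate unaligned input, as in this PR.
    auto q = reinterpret_cast<uint8_t const*>(data + offset);
    return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
  }

  __device__ uint32_t compute_bytes(std::byte const* data, int len, uint32_t seed) const
  {
    uint32_t h1 = seed;
    for (int i = 0; i < len / 4; ++i) {
      // Four-byte block mixing shared by both hashers (same magic constants).
      uint32_t k1 = getblock32(data, i * 4);
      k1 *= 0xcc9e2d51;
      k1 = rotl32(k1, 15);
      k1 *= 0x1b873593;
      h1 ^= k1;
      h1 = rotl32(h1, 13);
      h1 = h1 * 5 + 0xe6546b64;
    }
    // Only the tail processing differs, so it is delegated to the derived class.
    h1 = static_cast<Hasher const*>(this)->process_tail(data, len, h1);
    h1 ^= static_cast<uint32_t>(len);
    return h1;  // the real code would also apply fmix32(h1) here
  }
};

// A Spark-flavored derived hasher would then only define its tail logic:
struct spark_tail_hasher : murmur_hash_base<spark_tail_hasher> {
  __device__ uint32_t process_tail(std::byte const* data, int len, uint32_t h1) const
  {
    for (int i = (len / 4) * 4; i < len; ++i) {
      // Spark-style sign-extending byte read (see the Java-compatibility note below).
      uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
      k1 *= 0xcc9e2d51;
      k1 = rotl32(k1, 15);
      k1 *= 0x1b873593;
      h1 ^= k1;
      h1 = rotl32(h1, 13);
      h1 = h1 * 5 + 0xe6546b64;
    }
    return h1;
  }
};
```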

k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
h1 = rotl32(h1, rot_c2);
h1 = h1 * 5 + c3;
}
//----------
// tail
uint8_t const* tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;

// Process remaining bytes that do not fill a four-byte chunk.
uint32_t k1 = 0;
switch (len % 4) {
case 3: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 2]) << 16;
case 2: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 1]) << 8;
case 1:
k1 ^= tail[0];
k1 ^= std::to_integer<uint8_t>(data[tail_offset]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 = rotl32(k1, rot_c1);
k1 *= c2;
h1 ^= k1;
};
//----------
// finalization

// Finalize hash.
h1 ^= len;
h1 = fmix32(h1);
return h1;
@@ -207,49 +223,9 @@ template <>
hash_value_type __device__ inline MurmurHash3_32<cudf::string_view>::operator()(
cudf::string_view const& key) const
{
auto const len = key.size_bytes();
uint8_t const* data = reinterpret_cast<uint8_t const*>(key.data());
int const nblocks = len / 4;
result_type h1 = m_seed;
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t {
// Individual byte reads for unaligned accesses (very likely)
auto q = (uint8_t const*)(p + i);
return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
};

//----------
// body
uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
for (int i = -nblocks; i; i++) {
uint32_t k1 = getblock32(blocks, i);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
//----------
// tail
uint8_t const* tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1:
k1 ^= tail[0];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
};
//----------
// finalization
h1 ^= len;
h1 = fmix32(h1);
return h1;
auto const data = reinterpret_cast<std::byte const*>(key.data());
auto const len = key.size_bytes();
return this->compute_bytes(data, len);
}

template <>
@@ -338,50 +314,59 @@ struct SparkMurmurHash3_32 {
}
}

template <typename TKey>
result_type __device__ inline compute(TKey const& key) const
template <typename T>
result_type __device__ inline compute(T const& key) const
{
return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(TKey));
return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
}

result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const
[[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
cudf::size_type offset) const
{
constexpr cudf::size_type block_size = sizeof(uint32_t) / sizeof(std::byte);
cudf::size_type const nblocks = len / block_size;
uint32_t h1 = m_seed;
// Individual byte reads for unaligned accesses (very likely for strings)
auto q = reinterpret_cast<uint8_t const*>(data + offset);
return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
}

result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
{
constexpr cudf::size_type BLOCK_SIZE = 4;
cudf::size_type const nblocks = len / BLOCK_SIZE;
result_type h1 = m_seed;
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
constexpr uint32_t c3 = 0xe6546b64;
constexpr uint32_t rot_c1 = 15;
constexpr uint32_t rot_c2 = 13;

//----------
// Process all four-byte chunks
uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data);
// Process all four-byte chunks.
for (cudf::size_type i = 0; i < nblocks; i++) {
uint32_t k1 = blocks[i];
uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 = rotl32(k1, rot_c1);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
h1 = rotl32(h1, rot_c2);
h1 = h1 * 5 + c3;
}
//----------

// Process remaining bytes that do not fill a four-byte chunk using Spark's approach
// (does not conform to normal MurmurHash3)
// (does not conform to normal MurmurHash3).
for (cudf::size_type i = nblocks * 4; i < len; i++) {
// We require a two-step cast to get the k1 value from the byte. First,
// we must cast to a signed int8_t. Then, the sign bit is preserved when
// casting to uint32_t under 2's complement. Java preserves the
// signedness when casting byte-to-int, but C++ does not.
uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
k1 *= c1;
k1 = rotl32(k1, 15);
k1 = rotl32(k1, rot_c1);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
h1 = rotl32(h1, rot_c2);
h1 = h1 * 5 + c3;
}
//----------
// finalization

// Finalize hash.
h1 ^= len;
h1 = fmix32(h1);
return h1;
@@ -501,46 +486,9 @@ template <>
hash_value_type __device__ inline SparkMurmurHash3_32<cudf::string_view>::operator()(
cudf::string_view const& key) const
{
auto const len = key.size_bytes();
int8_t const* data = reinterpret_cast<int8_t const*>(key.data());
int const nblocks = len / 4;
result_type h1 = m_seed;
constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;
auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t {
// Individual byte reads for unaligned accesses (very likely)
auto q = (const uint8_t*)(p + i);
return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
};

//----------
// body
uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
for (int i = -nblocks; i; i++) {
uint32_t k1 = getblock32(blocks, i);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
//----------
// Spark's byte by byte tail processing
for (int i = nblocks * 4; i < len; i++) {
uint32_t k1 = data[i];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
//----------
// finalization
h1 ^= len;
h1 = fmix32(h1);
return h1;
auto const data = reinterpret_cast<std::byte const*>(key.data());
auto const len = key.size_bytes();
return this->compute_bytes(data, len);
}

template <>