Murmur3 hash kernel cleanup #10143
@@ -105,6 +105,14 @@ struct MurmurHash3_32 {
     return h;
   }

+  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
+                                                      cudf::size_type offset) const
+  {
+    // Individual byte reads for unaligned accesses (very likely for strings)
+    auto const q = reinterpret_cast<uint8_t const*>(data + offset);
+    return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
+  }
+
   /* Copyright 2005-2014 Daniel James.
    *
    * Use, modification and distribution is subject to the Boost Software
@@ -122,7 +130,7 @@ struct MurmurHash3_32 {
    *
    * @returns A hash value that intelligently combines the lhs and rhs hash values
    */
-  __device__ inline result_type hash_combine(result_type lhs, result_type rhs)
+  [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs)
   {
     result_type combined{lhs};
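For context, the body of hash_combine is elided from this hunk; the Boost attribution above suggests the Daniel James combiner. A sketch of that well-known recipe, offered as an assumption rather than the file's verbatim body:

    #include <cstdint>

    // Boost-style hash_combine (Daniel James). The constant 0x9e3779b9 derives
    // from the golden ratio; the shifts spread entropy across bit positions.
    uint32_t hash_combine_sketch(uint32_t lhs, uint32_t rhs)
    {
      lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2);
      return lhs;
    }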
@@ -131,7 +139,10 @@ struct MurmurHash3_32 {
     return combined;
   }

-  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
+  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
+  {
+    return compute(key);
+  }

   // compute wrapper for floating point types
   template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
@@ -147,44 +158,49 @@ struct MurmurHash3_32 {
     }
   }

-  template <typename TKey>
-  result_type __device__ inline compute(TKey const& key) const
+  template <typename T>
+  result_type __device__ inline compute(T const& key) const
   {
-    constexpr int len         = sizeof(argument_type);
-    uint8_t const* const data = reinterpret_cast<uint8_t const*>(&key);
-    constexpr int nblocks     = len / 4;
-
-    uint32_t h1           = m_seed;
-    constexpr uint32_t c1 = 0xcc9e2d51;
-    constexpr uint32_t c2 = 0x1b873593;
-    //----------
-    // body
-    uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
-    for (int i = -nblocks; i; i++) {
-      uint32_t k1 = blocks[i];  // getblock32(blocks,i);
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
+  }
+
+  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
+  {
+    constexpr cudf::size_type BLOCK_SIZE = 4;
+    cudf::size_type const nblocks        = len / BLOCK_SIZE;
+    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
+    result_type h1                       = m_seed;
+    constexpr uint32_t c1                = 0xcc9e2d51;
+    constexpr uint32_t c2                = 0x1b873593;
+    constexpr uint32_t c3                = 0xe6546b64;
+    constexpr uint32_t rot_c1            = 15;
+    constexpr uint32_t rot_c2            = 13;
+
+    // Process all four-byte chunks.
+    for (cudf::size_type i = 0; i < nblocks; i++) {
+      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
       k1 *= c1;
-      k1 = rotl32(k1, 15);
+      k1 = rotl32(k1, rot_c1);
       k1 *= c2;
       h1 ^= k1;
-      h1 = rotl32(h1, 13);
-      h1 = h1 * 5 + 0xe6546b64;
+      h1 = rotl32(h1, rot_c2);
+      h1 = h1 * 5 + c3;
     }
-    //----------
-    // tail
-    uint8_t const* tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
-    uint32_t k1         = 0;
-    switch (len & 3) {
-      case 3: k1 ^= tail[2] << 16;
-      case 2: k1 ^= tail[1] << 8;
+
+    // Process remaining bytes that do not fill a four-byte chunk.
+    uint32_t k1 = 0;
+    switch (len % 4) {
+      case 3: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 2]) << 16;
+      case 2: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 1]) << 8;
       case 1:
-        k1 ^= tail[0];
+        k1 ^= std::to_integer<uint8_t>(data[tail_offset]);
         k1 *= c1;
-        k1 = rotl32(k1, 15);
+        k1 = rotl32(k1, rot_c1);
         k1 *= c2;
        h1 ^= k1;
     };
-    //----------
-    // finalization
+
+    // Finalize hash.
     h1 ^= len;
     h1 = fmix32(h1);
     return h1;

Review comment on the `k1 = rotl32(k1, rot_c1);` line (resolved): Future PR: We might define common functions and magic values between
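For reference, the refactored compute_bytes maps one-to-one onto the canonical MurmurHash3_x86_32 layout. Below is a hedged host-side transliteration (standalone C++; rotl32 and fmix32 are written out under the assumption that the device versions use the standard MurmurHash3 definitions). Note the intentional fallthrough in the tail switch: for len % 4 == 3, all three leftover bytes are folded into k1 before the single mix.

    #include <cstdint>
    #include <cstdio>

    static uint32_t rotl32(uint32_t x, uint32_t r) { return (x << r) | (x >> (32 - r)); }

    // Standard MurmurHash3 finalizer.
    static uint32_t fmix32(uint32_t h)
    {
      h ^= h >> 16;
      h *= 0x85ebca6b;
      h ^= h >> 13;
      h *= 0xc2b2ae35;
      h ^= h >> 16;
      return h;
    }

    uint32_t murmur3_32(unsigned char const* data, int len, uint32_t seed)
    {
      int const nblocks     = len / 4;
      int const tail_offset = nblocks * 4;
      uint32_t h1           = seed;
      constexpr uint32_t c1 = 0xcc9e2d51;
      constexpr uint32_t c2 = 0x1b873593;

      // Body: four-byte chunks, read byte-wise for alignment safety.
      for (int i = 0; i < nblocks; i++) {
        unsigned char const* q = data + i * 4;
        uint32_t k1 = q[0] | (q[1] << 8) | (q[2] << 16) | (static_cast<uint32_t>(q[3]) << 24);
        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;
        h1 ^= k1;
        h1 = rotl32(h1, 13);
        h1 = h1 * 5 + 0xe6546b64;
      }

      // Tail: deliberate fallthrough folds the 0-3 leftover bytes.
      uint32_t k1 = 0;
      switch (len % 4) {
        case 3: k1 ^= static_cast<uint32_t>(data[tail_offset + 2]) << 16;  // fallthrough
        case 2: k1 ^= static_cast<uint32_t>(data[tail_offset + 1]) << 8;   // fallthrough
        case 1:
          k1 ^= data[tail_offset];
          k1 *= c1;
          k1 = rotl32(k1, 15);
          k1 *= c2;
          h1 ^= k1;
      }

      // Finalization.
      h1 ^= len;
      return fmix32(h1);
    }

    int main()
    {
      char const msg[] = "hello";
      std::printf("%08x\n", murmur3_32(reinterpret_cast<unsigned char const*>(msg), 5, 0));
      return 0;
    }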
@@ -207,49 +223,9 @@ template <>
 hash_value_type __device__ inline MurmurHash3_32<cudf::string_view>::operator()(
   cudf::string_view const& key) const
 {
-  auto const len      = key.size_bytes();
-  uint8_t const* data = reinterpret_cast<uint8_t const*>(key.data());
-  int const nblocks   = len / 4;
-  result_type h1      = m_seed;
-  constexpr uint32_t c1 = 0xcc9e2d51;
-  constexpr uint32_t c2 = 0x1b873593;
-  auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t {
-    // Individual byte reads for unaligned accesses (very likely)
-    auto q = (uint8_t const*)(p + i);
-    return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
-  };
-
-  //----------
-  // body
-  uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
-  for (int i = -nblocks; i; i++) {
-    uint32_t k1 = getblock32(blocks, i);
-    k1 *= c1;
-    k1 = rotl32(k1, 15);
-    k1 *= c2;
-    h1 ^= k1;
-    h1 = rotl32(h1, 13);
-    h1 = h1 * 5 + 0xe6546b64;
-  }
-  //----------
-  // tail
-  uint8_t const* tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
-  uint32_t k1         = 0;
-  switch (len & 3) {
-    case 3: k1 ^= tail[2] << 16;
-    case 2: k1 ^= tail[1] << 8;
-    case 1:
-      k1 ^= tail[0];
-      k1 *= c1;
-      k1 = rotl32(k1, 15);
-      k1 *= c2;
-      h1 ^= k1;
-  };
-  //----------
-  // finalization
-  h1 ^= len;
-  h1 = fmix32(h1);
-  return h1;
+  auto const data = reinterpret_cast<std::byte const*>(key.data());
+  auto const len  = key.size_bytes();
+  return this->compute_bytes(data, len);
 }

 template <>
@@ -338,50 +314,59 @@ struct SparkMurmurHash3_32 {
     }
   }

-  template <typename TKey>
-  result_type __device__ inline compute(TKey const& key) const
+  template <typename T>
+  result_type __device__ inline compute(T const& key) const
   {
-    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(TKey));
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
   }

-  result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const
+  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
+                                                      cudf::size_type offset) const
   {
-    constexpr cudf::size_type block_size = sizeof(uint32_t) / sizeof(std::byte);
-    cudf::size_type const nblocks        = len / block_size;
-    uint32_t h1                          = m_seed;
+    // Individual byte reads for unaligned accesses (very likely for strings)
+    auto q = reinterpret_cast<uint8_t const*>(data + offset);
+    return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
+  }
+
+  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
+  {
+    constexpr cudf::size_type BLOCK_SIZE = 4;
+    cudf::size_type const nblocks        = len / BLOCK_SIZE;
+    result_type h1                       = m_seed;
     constexpr uint32_t c1                = 0xcc9e2d51;
     constexpr uint32_t c2                = 0x1b873593;
+    constexpr uint32_t c3                = 0xe6546b64;
+    constexpr uint32_t rot_c1            = 15;
+    constexpr uint32_t rot_c2            = 13;

-    //----------
-    // Process all four-byte chunks
-    uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data);
+    // Process all four-byte chunks.
     for (cudf::size_type i = 0; i < nblocks; i++) {
-      uint32_t k1 = blocks[i];
+      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
       k1 *= c1;
-      k1 = rotl32(k1, 15);
+      k1 = rotl32(k1, rot_c1);
       k1 *= c2;
       h1 ^= k1;
-      h1 = rotl32(h1, 13);
-      h1 = h1 * 5 + 0xe6546b64;
+      h1 = rotl32(h1, rot_c2);
+      h1 = h1 * 5 + c3;
     }
-    //----------
+
     // Process remaining bytes that do not fill a four-byte chunk using Spark's approach
-    // (does not conform to normal MurmurHash3)
+    // (does not conform to normal MurmurHash3).
     for (cudf::size_type i = nblocks * 4; i < len; i++) {
       // We require a two-step cast to get the k1 value from the byte. First,
       // we must cast to a signed int8_t. Then, the sign bit is preserved when
       // casting to uint32_t under 2's complement. Java preserves the
       // signedness when casting byte-to-int, but C++ does not.
       uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
       k1 *= c1;
-      k1 = rotl32(k1, 15);
+      k1 = rotl32(k1, rot_c1);
       k1 *= c2;
       h1 ^= k1;
-      h1 = rotl32(h1, 13);
-      h1 = h1 * 5 + 0xe6546b64;
+      h1 = rotl32(h1, rot_c2);
+      h1 = h1 * 5 + c3;
     }
-    //----------
-    // finalization
+
+    // Finalize hash.
     h1 ^= len;
     h1 = fmix32(h1);
     return h1;
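The two-step cast commented in this hunk is load-bearing for Spark compatibility: Java sign-extends when widening a byte to int, so the C++ side must go through int8_t first. A quick standalone check of the semantics being relied on (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
      std::byte b{0x80};  // a byte whose sign bit is set

      // Two-step cast: byte -> int8_t -> uint32_t sign-extends, matching Java.
      uint32_t signed_path = static_cast<uint32_t>(std::to_integer<int8_t>(b));

      // Single unsigned cast: byte -> uint8_t -> uint32_t zero-extends.
      uint32_t unsigned_path = std::to_integer<uint8_t>(b);

      std::printf("%08x vs %08x\n", signed_path, unsigned_path);
      // Prints: ffffff80 vs 00000080
      return 0;
    }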
@@ -501,46 +486,9 @@ template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<cudf::string_view>::operator()(
   cudf::string_view const& key) const
 {
-  auto const len     = key.size_bytes();
-  int8_t const* data = reinterpret_cast<int8_t const*>(key.data());
-  int const nblocks  = len / 4;
-  result_type h1     = m_seed;
-  constexpr uint32_t c1 = 0xcc9e2d51;
-  constexpr uint32_t c2 = 0x1b873593;
-  auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t {
-    // Individual byte reads for unaligned accesses (very likely)
-    auto q = (const uint8_t*)(p + i);
-    return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24);
-  };
-
-  //----------
-  // body
-  uint32_t const* const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
-  for (int i = -nblocks; i; i++) {
-    uint32_t k1 = getblock32(blocks, i);
-    k1 *= c1;
-    k1 = rotl32(k1, 15);
-    k1 *= c2;
-    h1 ^= k1;
-    h1 = rotl32(h1, 13);
-    h1 = h1 * 5 + 0xe6546b64;
-  }
-  //----------
-  // Spark's byte by byte tail processing
-  for (int i = nblocks * 4; i < len; i++) {
-    uint32_t k1 = data[i];
-    k1 *= c1;
-    k1 = rotl32(k1, 15);
-    k1 *= c2;
-    h1 ^= k1;
-    h1 = rotl32(h1, 13);
-    h1 = h1 * 5 + 0xe6546b64;
-  }
-  //----------
-  // finalization
-  h1 ^= len;
-  h1 = fmix32(h1);
-  return h1;
+  auto const data = reinterpret_cast<std::byte const*>(key.data());
+  auto const len  = key.size_bytes();
+  return this->compute_bytes(data, len);
 }

 template <>
Review comment:

Note to self for a future PR: Do we need `MurmurHash3_32` to be a templated class? Currently the class takes a template parameter `Key` and has an `operator()(Key key)` with no template parameters, which calls a templated `compute(T key)`. However, the way it's called in `row_operators.cuh` seems to indicate that we could instead have a plain (non-template) class with a templated `operator()`. That's the way we typically do type dispatching, and it's reversed here for no clear reason. The calling code uses a type dispatch on `element_hasher_with_seed`.

(This would probably affect performance and/or compile time, but I don't know if it would be better or worse.)
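To make the alternative concrete, here is a rough sketch of the two shapes being compared: the current class template versus a plain class with a templated call operator. Names and the placeholder byte hasher are hypothetical, not cudf code:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Placeholder byte hasher standing in for compute_bytes (illustrative only).
    inline uint32_t hash_bytes(unsigned char const* p, size_t n)
    {
      uint32_t h = 0;
      for (size_t i = 0; i < n; ++i) { h = h * 31 + p[i]; }
      return h;
    }

    // Current shape (simplified): the class itself is templated on Key, and
    // operator() accepts exactly that Key type.
    template <typename Key>
    struct hasher_current {
      uint32_t operator()(Key const& key) const
      {
        unsigned char buf[sizeof(Key)];
        std::memcpy(buf, &key, sizeof(Key));
        return hash_bytes(buf, sizeof(Key));
      }
    };

    // Suggested shape: a plain (non-template) class whose operator() is
    // templated, so the element type is dispatched per call site, matching
    // how cudf's type dispatch usually works.
    struct hasher_suggested {
      template <typename T>
      uint32_t operator()(T const& key) const
      {
        unsigned char buf[sizeof(T)];
        std::memcpy(buf, &key, sizeof(T));
        return hash_bytes(buf, sizeof(T));
      }
    };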