From 1376774180c4cc62db2f9a51dc859144550dbd03 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Wed, 26 Jan 2022 16:31:20 -0800 Subject: [PATCH 1/4] Merge kernels into compute_bytes and cleanup kernel code --- .../cudf/detail/utilities/hash_functions.cuh | 163 ++++++------------ 1 file changed, 50 insertions(+), 113 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index b5ca5a3590e..f9023fcf32a 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -150,36 +150,47 @@ struct MurmurHash3_32 { template result_type __device__ inline compute(TKey const& key) const { - constexpr int len = sizeof(argument_type); - uint8_t const* const data = reinterpret_cast(&key); - constexpr int nblocks = len / 4; + return compute_bytes(reinterpret_cast(&key), sizeof(TKey)); + } + + result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const + { + cudf::size_type const nblocks = len / 4; + cudf::size_type const tail_offset = nblocks * 4; + result_type h1 = m_seed; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + constexpr uint32_t c3 = 0xe6546b64; + constexpr uint32_t rot_c1 = 15; + constexpr uint32_t rot_c2 = 13; + auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { + // Individual byte reads for unaligned accesses (very likely for strings) + auto q = (uint8_t const*)(p + i); + return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + }; - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; //---------- // body - uint32_t const* const blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); + uint32_t const* const blocks = reinterpret_cast(data); + for (int i = 0; i < nblocks; i++) { + uint32_t k1 = getblock32(blocks, i); k1 *= c1; - k1 = rotl32(k1, 15); + k1 = rotl32(k1, rot_c1); k1 *= c2; h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; + h1 = rotl32(h1, rot_c2); + h1 = h1 * 5 + c3; } //---------- // tail - uint8_t const* tail = reinterpret_cast(data + nblocks * 4); - uint32_t k1 = 0; + uint32_t k1 = 0; switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; + case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; + case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; case 1: - k1 ^= tail[0]; + k1 ^= std::to_integer(data[tail_offset]); k1 *= c1; - k1 = rotl32(k1, 15); + k1 = rotl32(k1, rot_c1); k1 *= c2; h1 ^= k1; }; @@ -207,49 +218,7 @@ template <> hash_value_type __device__ inline MurmurHash3_32::operator()( cudf::string_view const& key) const { - auto const len = key.size_bytes(); - uint8_t const* data = reinterpret_cast(key.data()); - int const nblocks = len / 4; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { - // Individual byte reads for unaligned accesses (very likely) - auto q = (uint8_t const*)(p + i); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); - }; - - //---------- - // body - uint32_t const* const blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = getblock32(blocks, i); - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - //---------- - // tail - uint8_t const* tail = reinterpret_cast(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: - k1 ^= tail[0]; - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - }; - //---------- - // finalization - h1 ^= len; - h1 = fmix32(h1); - return h1; + return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); } template <> @@ -346,23 +315,30 @@ struct SparkMurmurHash3_32 { result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const { - constexpr cudf::size_type block_size = sizeof(uint32_t) / sizeof(std::byte); - cudf::size_type const nblocks = len / block_size; - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + cudf::size_type const nblocks = len / 4; + uint32_t h1 = m_seed; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + constexpr uint32_t c3 = 0xe6546b64; + constexpr uint32_t rot_c1 = 15; + constexpr uint32_t rot_c2 = 13; + auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { + // Individual byte reads for unaligned accesses (very likely for strings) + auto q = (uint8_t const*)(p + i); + return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + }; //---------- // Process all four-byte chunks uint32_t const* const blocks = reinterpret_cast(data); for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = blocks[i]; + uint32_t k1 = getblock32(blocks, i); k1 *= c1; - k1 = rotl32(k1, 15); + k1 = rotl32(k1, rot_c1); k1 *= c2; h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; + h1 = rotl32(h1, rot_c2); + h1 = h1 * 5 + c3; } //---------- // Process remaining bytes that do not fill a four-byte chunk using Spark's approach @@ -374,11 +350,11 @@ struct SparkMurmurHash3_32 { // signedness when casting byte-to-int, but C++ does not. uint32_t k1 = static_cast(std::to_integer(data[i])); k1 *= c1; - k1 = rotl32(k1, 15); + k1 = rotl32(k1, rot_c1); k1 *= c2; h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; + h1 = rotl32(h1, rot_c2); + h1 = h1 * 5 + c3; } //---------- // finalization @@ -501,46 +477,7 @@ template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( cudf::string_view const& key) const { - auto const len = key.size_bytes(); - int8_t const* data = reinterpret_cast(key.data()); - int const nblocks = len / 4; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { - // Individual byte reads for unaligned accesses (very likely) - auto q = (const uint8_t*)(p + i); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); - }; - - //---------- - // body - uint32_t const* const blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = getblock32(blocks, i); - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - //---------- - // Spark's byte by byte tail processing - for (int i = nblocks * 4; i < len; i++) { - uint32_t k1 = data[i]; - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - //---------- - // finalization - h1 ^= len; - h1 = fmix32(h1); - return h1; + return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); } template <> From db648d779c1c3243886afa31db0e9358b86bf38c Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Fri, 28 Jan 2022 14:52:08 -0800 Subject: [PATCH 2/4] formatting and code cleanup --- .../cudf/detail/utilities/hash_functions.cuh | 123 ++++++++++-------- 1 file changed, 67 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index f9023fcf32a..de1c444989a 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -105,6 +105,14 @@ struct MurmurHash3_32 { return h; } + [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, + cudf::size_type offset) const + { + // Individual byte reads for unaligned accesses (very likely for strings) + auto const q = reinterpret_cast(data + offset); + return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + } + /* Copyright 2005-2014 Daniel James. * * Use, modification and distribution is subject to the Boost Software @@ -122,7 +130,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - __device__ inline result_type hash_combine(result_type lhs, result_type rhs) + [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs) { result_type combined{lhs}; @@ -131,7 +139,10 @@ struct MurmurHash3_32 { return combined; } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } + [[nodiscard]] result_type __device__ inline operator()(Key const& key) const + { + return compute(key); + } // compute wrapper for floating point types template ::value>* = nullptr> @@ -147,33 +158,27 @@ struct MurmurHash3_32 { } } - template - result_type __device__ inline compute(TKey const& key) const + template + result_type __device__ inline compute(T const& key) const { - return compute_bytes(reinterpret_cast(&key), sizeof(TKey)); + return compute_bytes(reinterpret_cast(&key), sizeof(T)); } - result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const { - cudf::size_type const nblocks = len / 4; - cudf::size_type const tail_offset = nblocks * 4; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; - auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { - // Individual byte reads for unaligned accesses (very likely for strings) - auto q = (uint8_t const*)(p + i); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); - }; - - //---------- - // body - uint32_t const* const blocks = reinterpret_cast(data); - for (int i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(blocks, i); + constexpr cudf::size_type BLOCK_SIZE = 4; + cudf::size_type const nblocks = len / BLOCK_SIZE; + cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; + result_type h1 = m_seed; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + constexpr uint32_t c3 = 0xe6546b64; + constexpr uint32_t rot_c1 = 15; + constexpr uint32_t rot_c2 = 13; + + // Process all four-byte chunks. + for (cudf::size_type i = 0; i < nblocks; i++) { + uint32_t k1 = getblock32(data, i * BLOCK_SIZE); k1 *= c1; k1 = rotl32(k1, rot_c1); k1 *= c2; @@ -181,10 +186,10 @@ struct MurmurHash3_32 { h1 = rotl32(h1, rot_c2); h1 = h1 * 5 + c3; } - //---------- - // tail + + // Process remaining bytes that do not fill a four-byte chunk. uint32_t k1 = 0; - switch (len & 3) { + switch (len % 4) { case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; case 1: @@ -194,8 +199,8 @@ struct MurmurHash3_32 { k1 *= c2; h1 ^= k1; }; - //---------- - // finalization + + // Finalize hash. h1 ^= len; h1 = fmix32(h1); return h1; @@ -218,7 +223,9 @@ template <> hash_value_type __device__ inline MurmurHash3_32::operator()( cudf::string_view const& key) const { - return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return this->compute_bytes(data, len); } template <> @@ -307,32 +314,34 @@ struct SparkMurmurHash3_32 { } } - template - result_type __device__ inline compute(TKey const& key) const + template + result_type __device__ inline compute(T const& key) const { - return compute_bytes(reinterpret_cast(&key), sizeof(TKey)); + return compute_bytes(reinterpret_cast(&key), sizeof(T)); } - result_type __device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const + [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, + cudf::size_type offset) const { - cudf::size_type const nblocks = len / 4; - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; - auto getblock32 = [] __device__(uint32_t const* p, int i) -> uint32_t { - // Individual byte reads for unaligned accesses (very likely for strings) - auto q = (uint8_t const*)(p + i); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); - }; + // Individual byte reads for unaligned accesses (very likely for strings) + auto q = reinterpret_cast(data + offset); + return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + } - //---------- - // Process all four-byte chunks - uint32_t const* const blocks = reinterpret_cast(data); + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + { + constexpr cudf::size_type BLOCK_SIZE = 4; + cudf::size_type const nblocks = len / BLOCK_SIZE; + result_type h1 = m_seed; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; + constexpr uint32_t c3 = 0xe6546b64; + constexpr uint32_t rot_c1 = 15; + constexpr uint32_t rot_c2 = 13; + + // Process all four-byte chunks. for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(blocks, i); + uint32_t k1 = getblock32(data, i * BLOCK_SIZE); k1 *= c1; k1 = rotl32(k1, rot_c1); k1 *= c2; @@ -340,9 +349,9 @@ struct SparkMurmurHash3_32 { h1 = rotl32(h1, rot_c2); h1 = h1 * 5 + c3; } - //---------- + // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3) + // (does not conform to normal MurmurHash3). for (cudf::size_type i = nblocks * 4; i < len; i++) { // We require a two-step cast to get the k1 value from the byte. First, // we must cast to a signed int8_t. Then, the sign bit is preserved when @@ -356,8 +365,8 @@ struct SparkMurmurHash3_32 { h1 = rotl32(h1, rot_c2); h1 = h1 * 5 + c3; } - //---------- - // finalization + + // Finalize hash. h1 ^= len; h1 = fmix32(h1); return h1; @@ -477,7 +486,9 @@ template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( cudf::string_view const& key) const { - return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return this->compute_bytes(data, len); } template <> From 291ccb97d8f93bdeb47e443b8460ce1d74200d9e Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Fri, 28 Jan 2022 15:33:29 -0800 Subject: [PATCH 3/4] more formatting changes --- .../cudf/detail/utilities/hash_functions.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index de1c444989a..69a9179aa96 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -37,8 +37,8 @@ namespace detail { template T __device__ inline normalize_nans_and_zeros(T const& key) { - if constexpr (is_floating_point()) { - if (isnan(key)) { + if constexpr (cudf::is_floating_point()) { + if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } else if (key == T{0.0}) { return T{0.0}; @@ -84,8 +84,7 @@ void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destinatio // non-native version will be less than optimal. template struct MurmurHash3_32 { - using argument_type = Key; - using result_type = hash_value_type; + using result_type = hash_value_type; MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} @@ -108,7 +107,8 @@ struct MurmurHash3_32 { [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const { - // Individual byte reads for unaligned accesses (very likely for strings) + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). auto const q = reinterpret_cast(data + offset); return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); } @@ -139,6 +139,7 @@ struct MurmurHash3_32 { return combined; } + // TODO Do we need this operator() and/or compute? Probably not both. [[nodiscard]] result_type __device__ inline operator()(Key const& key) const { return compute(key); @@ -150,7 +151,7 @@ struct MurmurHash3_32 { { if (key == T{0.0}) { return compute(T{0.0}); - } else if (isnan(key)) { + } else if (std::isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); return compute(nan); } else { @@ -279,8 +280,7 @@ hash_value_type __device__ inline MurmurHash3_32::operator()( template struct SparkMurmurHash3_32 { - using argument_type = Key; - using result_type = hash_value_type; + using result_type = hash_value_type; SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} @@ -306,7 +306,7 @@ struct SparkMurmurHash3_32 { template ::value>* = nullptr> hash_value_type __device__ inline compute_floating_point(T const& key) const { - if (isnan(key)) { + if (std::isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); return compute(nan); } else { From 5eaa8e6a32520f0ac43104ff7fe795375cb04efb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 2 Feb 2022 17:25:11 -0600 Subject: [PATCH 4/4] Apply suggestions from code review --- cpp/include/cudf/detail/utilities/hash_functions.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 69a9179aa96..51d58383de4 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -109,8 +109,8 @@ struct MurmurHash3_32 { { // Read a 4-byte value from the data pointer as individual bytes for safe // unaligned access (very likely for string types). - auto const q = reinterpret_cast(data + offset); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + auto const block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } /* Copyright 2005-2014 Daniel James. @@ -324,8 +324,8 @@ struct SparkMurmurHash3_32 { cudf::size_type offset) const { // Individual byte reads for unaligned accesses (very likely for strings) - auto q = reinterpret_cast(data + offset); - return q[0] | (q[1] << 8) | (q[2] << 16) | (q[3] << 24); + auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const