diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 09d94d10e79..9c6f3e9cb13 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -31,11 +31,24 @@ #include #include +namespace cudf { + using hash_value_type = uint32_t; -namespace cudf { namespace detail { +/** + * Normalization of floating point NaNs, passthrough for all other values. + */ +template +T __device__ inline normalize_nans(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + } + return key; +} + /** * Normalization of floating point NaNs and zeros, passthrough for all other values. */ @@ -43,13 +56,9 @@ template T __device__ inline normalize_nans_and_zeros(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { - return std::numeric_limits::quiet_NaN(); - } else if (key == T{0.0}) { - return T{0.0}; - } + if (key == T{0.0}) { return T{0.0}; } } - return key; + return normalize_nans(key); } __device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) @@ -176,9 +185,6 @@ void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destinatio std::memcpy(destination, reinterpret_cast(&x), 8); } -} // namespace detail -} // namespace cudf - // MurmurHash3_32 implementation from // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp //----------------------------------------------------------------------------- @@ -192,7 +198,7 @@ template struct MurmurHash3_32 { using result_type = hash_value_type; - MurmurHash3_32() = default; + constexpr MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const @@ -214,24 +220,9 @@ struct MurmurHash3_32 { return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - // TODO Do we need this operator() and/or compute? Probably not both. [[nodiscard]] result_type __device__ inline operator()(Key const& key) const { - return compute(key); - } - - // compute wrapper for floating point types - template >* = nullptr> - hash_value_type __device__ inline compute_floating_point(T const& key) const - { - if (key == T{0.0}) { - return compute(T{0.0}); - } else if (std::isnan(key)) { - T nan = std::numeric_limits::quiet_NaN(); - return compute(nan); - } else { - return compute(key); - } + return compute(detail::normalize_nans_and_zeros(key)); } template @@ -240,17 +231,32 @@ struct MurmurHash3_32 { return compute_bytes(reinterpret_cast(&key), sizeof(T)); } + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const + { + // Process remaining bytes that do not fill a four-byte chunk. + uint32_t k1 = 0; + switch (len % 4) { + case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; [[fallthrough]]; + case 1: + k1 ^= std::to_integer(data[tail_offset]); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + }; + return h; + } + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const { constexpr cudf::size_type BLOCK_SIZE = 4; cudf::size_type const nblocks = len / BLOCK_SIZE; cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; + result_type h = m_seed; // Process all four-byte chunks. for (cudf::size_type i = 0; i < nblocks; i++) { @@ -258,50 +264,44 @@ struct MurmurHash3_32 { k1 *= c1; k1 = cudf::detail::rotate_bits_left(k1, rot_c1); k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; } - // Process remaining bytes that do not fill a four-byte chunk. - uint32_t k1 = 0; - switch (len % 4) { - case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; - case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; - case 1: - k1 ^= std::to_integer(data[tail_offset]); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h1 ^= k1; - }; + h = compute_remaining_bytes(data, len, tail_offset, h); // Finalize hash. - h1 ^= len; - h1 = fmix32(h1); - return h1; + h ^= len; + h = fmix32(h); + return h; } private: uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; }; template <> hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const { - return this->compute(static_cast(key)); + return compute(static_cast(key)); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans_and_zeros(key)); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans_and_zeros(key)); } template <> @@ -310,28 +310,28 @@ hash_value_type __device__ inline MurmurHash3_32::operator()( { auto const data = reinterpret_cast(key.data()); auto const len = key.size_bytes(); - return this->compute_bytes(data, len); + return compute_bytes(data, len); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal32 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal64 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal128 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> @@ -352,10 +352,10 @@ template struct SparkMurmurHash3_32 { using result_type = hash_value_type; - SparkMurmurHash3_32() = default; + constexpr SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} - __device__ inline uint32_t fmix32(uint32_t h) const + [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -365,18 +365,18 @@ struct SparkMurmurHash3_32 { return h; } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } + [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, + cudf::size_type offset) const + { + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). + auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + } - // compute wrapper for floating point types - template >* = nullptr> - hash_value_type __device__ inline compute_floating_point(T const& key) const + [[nodiscard]] result_type __device__ inline operator()(Key const& key) const { - if (std::isnan(key)) { - T nan = std::numeric_limits::quiet_NaN(); - return compute(nan); - } else { - return compute(key); - } + return compute(key); } template @@ -385,24 +385,35 @@ struct SparkMurmurHash3_32 { return compute_bytes(reinterpret_cast(&key), sizeof(T)); } - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const { - // Individual byte reads for unaligned accesses (very likely for strings) - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + // Process remaining bytes that do not fill a four-byte chunk using Spark's approach + // (does not conform to normal MurmurHash3). + for (auto i = tail_offset; i < len; i++) { + // We require a two-step cast to get the k1 value from the byte. First, + // we must cast to a signed int8_t. Then, the sign bit is preserved when + // casting to uint32_t under 2's complement. Java preserves the sign when + // casting byte-to-int, but C++ does not. + uint32_t k1 = static_cast(std::to_integer(data[i])); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; + } + return h; } result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const { constexpr cudf::size_type BLOCK_SIZE = 4; cudf::size_type const nblocks = len / BLOCK_SIZE; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; + cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; + result_type h = m_seed; // Process all four-byte chunks. for (cudf::size_type i = 0; i < nblocks; i++) { @@ -410,78 +421,69 @@ struct SparkMurmurHash3_32 { k1 *= c1; k1 = cudf::detail::rotate_bits_left(k1, rot_c1); k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; } - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). - for (cudf::size_type i = nblocks * 4; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the - // signedness when casting byte-to-int, but C++ does not. - uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; - } + h = compute_remaining_bytes(data, len, tail_offset, h); // Finalize hash. - h1 ^= len; - h1 = fmix32(h1); - return h1; + h ^= len; + h = fmix32(h); + return h; } private: uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; }; template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(bool const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int8_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(uint8_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int16_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( uint16_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans(key)); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans(key)); } template <> @@ -490,21 +492,21 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operat { auto const data = reinterpret_cast(key.data()); auto const len = key.size_bytes(); - return this->compute_bytes(data, len); + return compute_bytes(data, len); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal32 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal64 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> @@ -546,7 +548,7 @@ hash_value_type __device__ inline SparkMurmurHash3_32::oper __int128_t big_endian_value = 0; auto big_endian_data = reinterpret_cast(&big_endian_value); thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return this->compute_bytes(big_endian_data, length); + return compute_bytes(big_endian_data, length); } template <> @@ -593,3 +595,6 @@ struct IdentityHash { template using default_hash = MurmurHash3_32; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 44df981f5bf..f225afaec71 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys, size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; - using map_type = concurrent_unordered_map, - row_equality_comparator>; + using map_type = + concurrent_unordered_map, + row_equality_comparator>; using allocator_type = typename map_type::allocator_type; auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys}; + row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, + d_keys}; row_equality_comparator rows_equal{ nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal}; diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 76f3fba4689..9136410a03d 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -113,7 +113,7 @@ union pair_packer()>> { */ template , + typename Hasher = cudf::detail::default_hash, typename Equality = equal_to, typename Allocator = default_allocator>> class concurrent_unordered_map { diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 56a00191ae4..43411157319 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin, ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); - auto const key_hash = MurmurHash3_32{}( + auto const key_hash = cudf::detail::MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) @@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); keys_info->column(1).element(idx) = len; keys_info->column(2).element(idx) = - MurmurHash3_32{}(cudf::string_view(field_range.key_begin, len)); + cudf::detail::MurmurHash3_32{}( + cudf::string_view(field_range.key_begin, len)); } } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index f61cfa83579..45d0ea40a26 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -55,7 +55,10 @@ struct equality_functor { template struct hash_functor { column_device_view const& col; - __device__ auto operator()(size_type idx) { return MurmurHash3_32{}(col.element(idx)); } + __device__ auto operator()(size_type idx) const + { + return cudf::detail::MurmurHash3_32{}(col.element(idx)); + } }; struct map_insert_fn { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 43686b7d257..09f07a1ca8c 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -779,10 +779,10 @@ std::pair, std::vector> hash_partition( if (!is_numeric(input.column(column_id).type())) CUDF_FAIL("IdentityHash does not support this data type"); } - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); case (hash_id::HASH_MURMUR3): - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); default: CUDF_FAIL("Unsupported hash function in hash_partition"); } diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index fb631b3f31f..404ecf1248c 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -144,8 +145,8 @@ struct byte_pair_encoding_fn { * @param rhs Second string. * @return The hash value to match with `d_map`. */ - __device__ hash_value_type compute_hash(cudf::string_view const& lhs, - cudf::string_view const& rhs) + __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs, + cudf::string_view const& rhs) { __shared__ char shmem[48 * 1024]; // max for Pascal auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 31cc29a8d8a..24b10fc4a36 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -36,12 +36,12 @@ namespace detail { using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using merge_pairs_map_type = cuco::static_map; -using string_hasher_type = MurmurHash3_32; +using string_hasher_type = cudf::detail::MurmurHash3_32; } // namespace detail diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 31f579dc9d4..1e0c9c81fcd 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -42,7 +43,7 @@ struct make_pair_function { /** * @brief Hash the merge pair entry */ - __device__ cuco::pair_type operator()(cudf::size_type idx) + __device__ cuco::pair_type operator()(cudf::size_type idx) { auto const result = _hasher(d_strings.element(idx)); return cuco::make_pair(result, idx); @@ -105,9 +106,9 @@ std::unique_ptr initialize_merge_pairs_map( // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; - std::numeric_limits::max(), // empty key; - -1, // empty value is not used + static_cast(input.size() * 2), // capacity is 2x; + std::numeric_limits::max(), // empty key; + -1, // empty value is not used hash_table_allocator_type{default_allocator{}, stream}, stream.value()); @@ -117,8 +118,8 @@ std::unique_ptr initialize_merge_pairs_map( merge_pairs_map->insert(iter, iter + input.size(), - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, + cuco::detail::MurmurHash3_32{}, + thrust::equal_to{}, stream.value()); return merge_pairs_map;