From 299f6cc11fb2d61d64bf0041c192e4d232e0f8e5 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 1 Apr 2021 02:52:41 -0700 Subject: [PATCH] Allow hash_partition to take a seed value (#7771) This PR is to allow hash partitioning to configure the seed of its hash function. As noted in #6307, using the same hash function in hash partitioning and join leads to a massive hash collision and severely degrades join performance on multiple GPUs. There was an initial fix (#6726) to this problem, but it added only the code path to use identity hash function in hash partitioning, which doesn't support complex data types and thus cannot be used in general. In fact, using the same general Murmur3 hash function with different seeds in hash partitioning and join turned out to be a sufficient fix. This PR is to enable such configurations by making `hash_partition` accept an optional seed value. Authors: - Wonchan Lee (https://github.com/magnatelee) Approvers: - https://github.com/gaohao95 - Mark Harris (https://github.com/harrism) - https://github.com/nvdbaranec - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7771 --- .../cudf/detail/utilities/hash_functions.cuh | 30 +++++++++++++------ cpp/include/cudf/hashing.hpp | 2 +- cpp/include/cudf/partitioning.hpp | 4 +++ cpp/include/cudf/table/row_operators.cuh | 27 ++++++++++++----- cpp/include/cudf/types.hpp | 5 ++++ cpp/src/partitioning/partitioning.cu | 13 ++++---- .../partitioning/hash_partition_test.cpp | 28 +++++++++++++++++ 7 files changed, 87 insertions(+), 22 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index e79107e32cf..7f3c05134e2 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include using hash_value_type = uint32_t; @@ -231,6 +232,9 @@ MD5ListHasher::operator()(column_device_view data_col, } struct MD5Hash { + MD5Hash() = default; + constexpr MD5Hash(uint32_t seed) : m_seed(seed) {} + void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const { auto const full_length = (static_cast(hash_state->message_length)) << 3; @@ -302,6 +306,9 @@ struct MD5Hash { { md5_process(col.element(row_index), hash_state); } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; }; template <> @@ -372,7 +379,7 @@ struct MurmurHash3_32 { using result_type = hash_value_type; MurmurHash3_32() = default; - CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32(uint32_t seed) : m_seed(seed) {} + constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const { @@ -469,7 +476,7 @@ struct MurmurHash3_32 { } private: - uint32_t m_seed{0}; + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; }; template <> @@ -564,7 +571,7 @@ struct SparkMurmurHash3_32 { using result_type = hash_value_type; SparkMurmurHash3_32() = default; - CUDA_HOST_DEVICE_CALLABLE SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} + constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const { @@ -636,7 +643,7 @@ struct SparkMurmurHash3_32 { } private: - uint32_t m_seed{0}; + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; }; template <> @@ -772,6 +779,8 @@ SparkMurmurHash3_32::operator()(double const& key) const template struct IdentityHash { using result_type = hash_value_type; + IdentityHash() = default; + constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} /** * @brief Combines two hash values into a new single hash value. Called @@ -784,7 +793,7 @@ struct IdentityHash { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - CUDA_HOST_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) const + constexpr result_type hash_combine(result_type lhs, result_type rhs) const { result_type combined{lhs}; @@ -794,19 +803,22 @@ struct IdentityHash { } template - CUDA_HOST_DEVICE_CALLABLE std::enable_if_t::value, return_type> - operator()(Key const& key) const + constexpr std::enable_if_t::value, return_type> operator()( + Key const& key) const { cudf_assert(false && "IdentityHash does not support this data type"); return 0; } template - CUDA_HOST_DEVICE_CALLABLE std::enable_if_t::value, return_type> - operator()(Key const& key) const + constexpr std::enable_if_t::value, return_type> operator()( + Key const& key) const { return static_cast(key); } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; }; template diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 3f95b8b417b..0fb5002a953 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -39,7 +39,7 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, std::vector const& initial_hash = {}, - uint32_t seed = 0, + uint32_t seed = DEFAULT_HASH_SEED, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index ddde26ec762..6b1ad7db08b 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -83,6 +83,9 @@ std::pair, std::vector> partition( * @param input The table to partition * @param columns_to_hash Indices of input columns to hash * @param num_partitions The number of partitions to use + * @param hash_function Optional hash id that chooses the hash function to use + * @param seed Optional seed value to the hash function + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. * * @returns An output table and a vector of row offsets to each partition @@ -92,6 +95,7 @@ std::pair, std::vector> hash_partition( std::vector const& columns_to_hash, int num_partitions, hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = DEFAULT_HASH_SEED, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index decd2879f54..61d714c5538 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -428,6 +428,7 @@ template