diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 24778c9e37e..6eab13ae9af 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -27,6 +27,104 @@ using hash_value_type = uint32_t; namespace cudf { namespace detail { +namespace { +/** + * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ +void CUDA_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state) +{ + uint32_t A = hash_state->hash_value[0]; + uint32_t B = hash_state->hash_value[1]; + uint32_t C = hash_state->hash_value[2]; + uint32_t D = hash_state->hash_value[3]; + + for (unsigned int j = 0; j < 64; j++) { + uint32_t F; + uint32_t g; + switch (j / 16) { + case 0: + F = (B & C) | ((~B) & D); + g = j; + break; + case 1: + F = (D & B) | ((~D) & C); + g = (5 * j + 1) % 16; + break; + case 2: + F = B ^ C ^ D; + g = (3 * j + 5) % 16; + break; + case 3: + F = C ^ (B | (~D)); + g = (7 * j) % 16; + break; + } + + uint32_t buffer_element_as_int; + std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4); + F = F + A + md5_hash_constants[j] + buffer_element_as_int; + A = D; + D = C; + C = B; + B = B + __funnelshift_l(F, F, md5_shift_constants[((j / 16) * 4) + (j % 4)]); + } + + hash_state->hash_value[0] += A; + hash_state->hash_value[1] += B; + hash_state->hash_value[2] += C; + hash_state->hash_value[3] += D; + + hash_state->buffer_length = 0; +} + +/** + * @brief Core MD5 element processing function + */ +template +void CUDA_DEVICE_CALLABLE md5_process(TKey const& key, md5_intermediate_data* hash_state) +{ + uint32_t const len = sizeof(TKey); + uint8_t const* data = reinterpret_cast(&key); + hash_state->message_length += len; + + // 64 bytes for the number of byt es processed in a given step + constexpr int md5_chunk_size = 64; + if (hash_state->buffer_length + len < md5_chunk_size) { + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); + hash_state->buffer_length += len; + } else { + uint32_t copylen = md5_chunk_size - hash_state->buffer_length; + + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); + md5_hash_step(hash_state); + + while (len > md5_chunk_size + copylen) { + std::memcpy(hash_state->buffer, data + copylen, md5_chunk_size); + md5_hash_step(hash_state); + copylen += md5_chunk_size; + } + + std::memcpy(hash_state->buffer, data + copylen, len - copylen); + hash_state->buffer_length = len - copylen; + } +} + +/** + * Normalization of floating point NANs and zeros helper + */ +template ::value>* = nullptr> +T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros_helper(T key) +{ + if (isnan(key)) { + return std::numeric_limits::quiet_NaN(); + } else if (key == T{0.0}) { + return T{0.0}; + } else { + return key; + } +} +} // namespace /** * Modified GPU implementation of @@ -51,6 +149,217 @@ void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destina std::memcpy(destination, reinterpret_cast(&x), 8); } +struct MD5ListHasher { + template ()>* = nullptr> + void __device__ operator()(column_device_view data_col, + size_type offset_begin, + size_type offset_end, + md5_intermediate_data* hash_state) const + { + cudf_assert(false && "MD5 Unsupported chrono type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view data_col, + size_type offset_begin, + size_type offset_end, + md5_intermediate_data* hash_state) const + { + cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view data_col, + size_type offset_begin, + size_type offset_end, + md5_intermediate_data* hash_state) const + { + for (int i = offset_begin; i < offset_end; i++) { + if (!data_col.is_null(i)) { + md5_process(normalize_nans_and_zeros_helper(data_col.element(i)), hash_state); + } + } + } + + template < + typename T, + std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> + void CUDA_DEVICE_CALLABLE operator()(column_device_view data_col, + size_type offset_begin, + size_type offset_end, + md5_intermediate_data* hash_state) const + { + for (int i = offset_begin; i < offset_end; i++) { + if (!data_col.is_null(i)) md5_process(data_col.element(i), hash_state); + } + } +}; + +template <> +void CUDA_DEVICE_CALLABLE +MD5ListHasher::operator()(column_device_view data_col, + size_type offset_begin, + size_type offset_end, + md5_intermediate_data* hash_state) const +{ + for (int i = offset_begin; i < offset_end; i++) { + if (!data_col.is_null(i)) { + string_view key = data_col.element(i); + uint32_t const len = static_cast(key.size_bytes()); + uint8_t const* data = reinterpret_cast(key.data()); + + hash_state->message_length += len; + + if (hash_state->buffer_length + len < 64) { + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); + hash_state->buffer_length += len; + } else { + uint32_t copylen = 64 - hash_state->buffer_length; + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); + md5_hash_step(hash_state); + + while (len > 64 + copylen) { + std::memcpy(hash_state->buffer, data + copylen, 64); + md5_hash_step(hash_state); + copylen += 64; + } + + std::memcpy(hash_state->buffer, data + copylen, len - copylen); + hash_state->buffer_length = len - copylen; + } + } + } +} + +struct MD5Hash { + MD5Hash() = default; + constexpr MD5Hash(uint32_t seed) : m_seed(seed) {} + + void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const + { + auto const full_length = (static_cast(hash_state->message_length)) << 3; + thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80); + + // 64 bytes for the number of bytes processed in a given step + constexpr int md5_chunk_size = 64; + // 8 bytes for the total message length, appended to the end of the last chunk processed + constexpr int message_length_size = 8; + // 1 byte for the end of the message flag + constexpr int end_of_message_size = 1; + if (hash_state->buffer_length + message_length_size + end_of_message_size <= md5_chunk_size) { + thrust::fill_n( + thrust::seq, + hash_state->buffer + hash_state->buffer_length + 1, + (md5_chunk_size - message_length_size - end_of_message_size - hash_state->buffer_length), + 0x00); + } else { + thrust::fill_n(thrust::seq, + hash_state->buffer + hash_state->buffer_length + 1, + (md5_chunk_size - hash_state->buffer_length), + 0x00); + md5_hash_step(hash_state); + + thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00); + } + + std::memcpy(hash_state->buffer + md5_chunk_size - message_length_size, + reinterpret_cast(&full_length), + message_length_size); + md5_hash_step(hash_state); + +#pragma unroll + for (int i = 0; i < 4; ++i) + uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i)); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + cudf_assert(false && "MD5 Unsupported chrono type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + md5_process(normalize_nans_and_zeros_helper(col.element(row_index)), hash_state); + } + + template < + typename T, + std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> + void CUDA_DEVICE_CALLABLE operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + md5_process(col.element(row_index), hash_state); + } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; +}; + +template <> +void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const +{ + string_view key = col.element(row_index); + uint32_t const len = static_cast(key.size_bytes()); + uint8_t const* data = reinterpret_cast(key.data()); + + hash_state->message_length += len; + + if (hash_state->buffer_length + len < 64) { + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); + hash_state->buffer_length += len; + } else { + uint32_t copylen = 64 - hash_state->buffer_length; + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); + md5_hash_step(hash_state); + + while (len > 64 + copylen) { + std::memcpy(hash_state->buffer, data + copylen, 64); + md5_hash_step(hash_state); + copylen += 64; + } + + std::memcpy(hash_state->buffer, data + copylen, len - copylen); + hash_state->buffer_length = len - copylen; + } +} + +template <> +void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const +{ + static constexpr size_type offsets_column_index{0}; + static constexpr size_type data_column_index{1}; + + column_device_view offsets = col.child(offsets_column_index); + column_device_view data = col.child(data_column_index); + + if (data.type().id() == type_id::LIST) cudf_assert(false && "Nested list unsupported"); + + cudf::type_dispatcher(data.type(), + MD5ListHasher{}, + data, + offsets.element(row_index), + offsets.element(row_index + 1), + hash_state); +} } // namespace detail } // namespace cudf diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 796aff001d9..973f3204c37 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -27,9 +27,6 @@ #include namespace cudf { - -namespace detail { - namespace { // MD5 supported leaf data type check @@ -38,322 +35,15 @@ bool md5_type_check(data_type dt) return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING)); } -/** - * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk, - * updating the hash value so far. Does not zero out the buffer contents. - */ -void CUDA_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state) -{ - uint32_t A = hash_state->hash_value[0]; - uint32_t B = hash_state->hash_value[1]; - uint32_t C = hash_state->hash_value[2]; - uint32_t D = hash_state->hash_value[3]; - - for (unsigned int j = 0; j < 64; j++) { - uint32_t F; - uint32_t g; - switch (j / 16) { - case 0: - F = (B & C) | ((~B) & D); - g = j; - break; - case 1: - F = (D & B) | ((~D) & C); - g = (5 * j + 1) % 16; - break; - case 2: - F = B ^ C ^ D; - g = (3 * j + 5) % 16; - break; - case 3: - F = C ^ (B | (~D)); - g = (7 * j) % 16; - break; - } - - uint32_t buffer_element_as_int; - std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4); - F = F + A + md5_hash_constants[j] + buffer_element_as_int; - A = D; - D = C; - C = B; - B = B + __funnelshift_l(F, F, md5_shift_constants[((j / 16) * 4) + (j % 4)]); - } - - hash_state->hash_value[0] += A; - hash_state->hash_value[1] += B; - hash_state->hash_value[2] += C; - hash_state->hash_value[3] += D; - - hash_state->buffer_length = 0; -} - -/** - * @brief Core MD5 element processing function - */ -template -void CUDA_DEVICE_CALLABLE md5_process(TKey const& key, md5_intermediate_data* hash_state) -{ - uint32_t const len = sizeof(TKey); - uint8_t const* data = reinterpret_cast(&key); - hash_state->message_length += len; - - // 64 bytes are processed in each hash step - constexpr int md5_chunk_size = 64; - if (hash_state->buffer_length + len < md5_chunk_size) { - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = md5_chunk_size - hash_state->buffer_length; - - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > md5_chunk_size + copylen) { - std::memcpy(hash_state->buffer, data + copylen, md5_chunk_size); - md5_hash_step(hash_state); - copylen += md5_chunk_size; - } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } -} - -/** - * Normalization of floating point NANs and zeros helper - */ -template ::value>* = nullptr> -T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros_helper(T key) -{ - if (isnan(key)) { - return std::numeric_limits::quiet_NaN(); - } else if (key == T{0.0}) { - return T{0.0}; - } else { - return key; - } -} - -struct MD5ListHasher { - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported chrono type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) { - md5_process(normalize_nans_and_zeros_helper(data_col.element(i)), hash_state); - } - } - } - - template < - typename T, - std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> - void CUDA_DEVICE_CALLABLE operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) md5_process(data_col.element(i), hash_state); - } - } -}; - -template <> -void CUDA_DEVICE_CALLABLE -MD5ListHasher::operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const -{ - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) { - string_view key = data_col.element(i); - uint32_t const len = static_cast(key.size_bytes()); - uint8_t const* data = reinterpret_cast(key.data()); - - hash_state->message_length += len; - - // 64 bytes are processed in each hash step - constexpr int md5_chunk_size = 64; - if (hash_state->buffer_length + len < md5_chunk_size) { - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = md5_chunk_size - hash_state->buffer_length; - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > md5_chunk_size + copylen) { - std::memcpy(hash_state->buffer, data + copylen, md5_chunk_size); - md5_hash_step(hash_state); - copylen += md5_chunk_size; - } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } - } - } -} - -struct MD5Hash { - MD5Hash() = default; - constexpr MD5Hash(uint32_t seed) {} - - void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const - { - auto const full_length = (static_cast(hash_state->message_length)) << 3; - thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80); - - // 64 bytes are processed in each hash step - constexpr int md5_chunk_size = 64; - // 8 bytes for the total message length, appended to the end of the last chunk processed - constexpr int message_length_size = 8; - // 1 byte for the end of the message flag - constexpr int end_of_message_size = 1; - if (hash_state->buffer_length + message_length_size + end_of_message_size <= md5_chunk_size) { - thrust::fill_n( - thrust::seq, - hash_state->buffer + hash_state->buffer_length + 1, - (md5_chunk_size - message_length_size - end_of_message_size - hash_state->buffer_length), - 0x00); - } else { - thrust::fill_n(thrust::seq, - hash_state->buffer + hash_state->buffer_length + 1, - (md5_chunk_size - hash_state->buffer_length), - 0x00); - md5_hash_step(hash_state); - - thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00); - } - - std::memcpy(hash_state->buffer + md5_chunk_size - message_length_size, - reinterpret_cast(&full_length), - message_length_size); - md5_hash_step(hash_state); - -#pragma unroll - for (int i = 0; i < 4; ++i) - uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i)); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported chrono type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - md5_process(normalize_nans_and_zeros_helper(col.element(row_index)), hash_state); - } - - template < - typename T, - std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> - void CUDA_DEVICE_CALLABLE operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - md5_process(col.element(row_index), hash_state); - } -}; - -template <> -void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const -{ - string_view key = col.element(row_index); - uint32_t const len = static_cast(key.size_bytes()); - uint8_t const* data = reinterpret_cast(key.data()); - - hash_state->message_length += len; - - if (hash_state->buffer_length + len < 64) { - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = 64 - hash_state->buffer_length; - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > 64 + copylen) { - std::memcpy(hash_state->buffer, data + copylen, 64); - md5_hash_step(hash_state); - copylen += 64; - } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } -} - -template <> -void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const -{ - static constexpr size_type offsets_column_index{0}; - static constexpr size_type data_column_index{1}; - - column_device_view offsets = col.child(offsets_column_index); - column_device_view data = col.child(data_column_index); - - if (data.type().id() == type_id::LIST) cudf_assert(false && "Nested list unsupported"); - - cudf::type_dispatcher(data.type(), - MD5ListHasher{}, - data, - offsets.element(row_index), - offsets.element(row_index + 1), - hash_state); -} - } // namespace +namespace detail { + std::unique_ptr md5_hash(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.num_columns() == 0 || input.num_rows() == 0) { - // Return the MD5 hash of a zero-length input. const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); auto output = make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); return output;