diff --git a/CHANGELOG.md b/CHANGELOG.md index a5b45f842c5..c7b8c8ff1c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,7 @@ - PR #5658 Add `filter_tokens` nvtext API - PR #5666 Add `filter_characters_of_type` strings API - PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build +- PR #5438 Add MD5 hash support - PR #5704 Initial `fixed_point` Column Support - PR #5716 Add `double_type_dispatcher` to libcudf - PR #5739 Add `nvtext::detokenize` API diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 3c83db0edba..359e6cc93bf 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,9 +37,21 @@ std::pair, std::vector> hash_partition( * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr hash(table_view const& input, + hash_id hash_function = hash_id::HASH_MURMUR3, std::vector const& initial_hash = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0); +std::unique_ptr murmur_hash3_32( + table_view const& input, + std::vector const& initial_hash = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0); + +std::unique_ptr md5_hash( + table_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 8f26690c835..da9c31286dd 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,236 @@ #pragma once +#include #include +#include + +#include "cudf/types.hpp" using hash_value_type = uint32_t; +namespace cudf { +namespace detail { +/** + * Modified GPU implementation of + * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ + * Copyright (c) 2015 Barry Clark + * Licensed under the MIT license. 
+ * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT + */ +void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destination) +{ + // Transform 0xABCD1234 => 0x0000ABCD00001234 => 0x0B0A0D0C02010403 + uint64_t x = num; + x = ((x & 0xFFFF0000) << 16) | ((x & 0xFFFF)); + x = ((x & 0xF0000000F) << 8) | ((x & 0xF0000000F0) >> 4) | ((x & 0xF0000000F00) << 16) | + ((x & 0xF0000000F000) << 4); + + // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits + uint64_t offsets = (((x + 0x0606060606060606) >> 4) & 0x0101010101010101) * 0x27; + + x |= 0x3030303030303030; + x += offsets; + thrust::copy_n(thrust::seq, reinterpret_cast(&x), 8, destination); +} + +struct MD5Hash { + /** + * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ + void __device__ hash_step(md5_intermediate_data* hash_state) const + { + uint32_t A = hash_state->hash_value[0]; + uint32_t B = hash_state->hash_value[1]; + uint32_t C = hash_state->hash_value[2]; + uint32_t D = hash_state->hash_value[3]; + + for (unsigned int j = 0; j < 64; j++) { + uint32_t F; + uint32_t g; + switch (j / 16) { + case 0: + F = (B & C) | ((~B) & D); + g = j; + break; + case 1: + F = (D & B) | ((~D) & C); + g = (5 * j + 1) % 16; + break; + case 2: + F = B ^ C ^ D; + g = (3 * j + 5) % 16; + break; + case 3: + F = C ^ (B | (~D)); + g = (7 * j) % 16; + break; + } + + uint32_t buffer_element_as_int; + std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4); + F = F + A + md5_hash_constants[j] + buffer_element_as_int; + A = D; + D = C; + C = B; + B = B + __funnelshift_l(F, F, md5_shift_constants[((j / 16) * 4) + (j % 4)]); + } + + hash_state->hash_value[0] += A; + hash_state->hash_value[1] += B; + hash_state->hash_value[2] += C; + hash_state->hash_value[3] += D; + + hash_state->buffer_length = 0; + } + + /** + * @brief Core MD5 element 
processing function + */ + template + void __device__ process(TKey const& key, md5_intermediate_data* hash_state) const + { + uint32_t const len = sizeof(TKey); + uint8_t const* data = reinterpret_cast(&key); + hash_state->message_length += len; + + // 64 bytes for the number of bytes processed in a given step + constexpr int md5_chunk_size = 64; + if (hash_state->buffer_length + len < md5_chunk_size) { + thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); + hash_state->buffer_length += len; + } else { + uint32_t copylen = md5_chunk_size - hash_state->buffer_length; + + thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); + hash_step(hash_state); + + while (len >= md5_chunk_size + copylen) { /* >= : hash a trailing full chunk here so buffer_length stays below 64 for finalize */ + thrust::copy_n(thrust::seq, data + copylen, md5_chunk_size, hash_state->buffer); + hash_step(hash_state); + copylen += md5_chunk_size; + } + + thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); + hash_state->buffer_length = len - copylen; + } + } + + void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const + { + auto const full_length = (static_cast(hash_state->message_length)) << 3; + thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80); + + // 64 bytes for the number of bytes processed in a given step + constexpr int md5_chunk_size = 64; + // 8 bytes for the total message length, appended to the end of the last chunk processed + constexpr int message_length_size = 8; + // 1 byte for the end of the message flag + constexpr int end_of_message_size = 1; + if (hash_state->buffer_length + message_length_size + end_of_message_size <= md5_chunk_size) { + thrust::fill_n( + thrust::seq, + hash_state->buffer + hash_state->buffer_length + 1, + (md5_chunk_size - message_length_size - end_of_message_size - hash_state->buffer_length), + 0x00); + } else { + thrust::fill_n(thrust::seq, + hash_state->buffer + 
hash_state->buffer_length + 1, + (md5_chunk_size - end_of_message_size - hash_state->buffer_length), /* the 0x80 flag already occupies one byte; zero only up to the end of the 64-byte buffer */ + 0x00); + hash_step(hash_state); + + thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00); + } + + thrust::copy_n(thrust::seq, + reinterpret_cast(&full_length), + message_length_size, + hash_state->buffer + md5_chunk_size - message_length_size); + hash_step(hash_state); + +#pragma unroll + for (int i = 0; i < 4; ++i) + uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i)); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + release_assert(false && "MD5 Unsupported chrono type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + release_assert(false && "MD5 Unsupported non-fixed-width type column"); + } + + template ()>* = nullptr> + void __device__ operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + T const& key = col.element(row_index); + if (isnan(key)) { + T nan = std::numeric_limits::quiet_NaN(); + process(nan, hash_state); + } else if (key == T{0.0}) { + process(T{0.0}, hash_state); + } else { + process(key, hash_state); + } + } + + template () && !is_floating_point() && + !is_chrono()>* = nullptr> + void CUDA_DEVICE_CALLABLE operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const + { + process(col.element(row_index), hash_state); + } +}; + +template <> +void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state) const +{ + string_view key = col.element(row_index); + uint32_t const len = static_cast(key.size_bytes()); + uint8_t const* data = reinterpret_cast(key.data()); + + hash_state->message_length += len; + + if 
(hash_state->buffer_length + len < 64) { + thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); + hash_state->buffer_length += len; + } else { + uint32_t copylen = 64 - hash_state->buffer_length; + thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); + hash_step(hash_state); + + while (len >= 64 + copylen) { /* >= : a remainder of exactly 64 bytes must be hashed now, otherwise finalize would write its 0x80 flag past the buffer */ + thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer); + hash_step(hash_state); + copylen += 64; + } + + thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); + hash_state->buffer_length = len - copylen; + } +} + +} // namespace detail +} // namespace cudf + // MurmurHash3_32 implementation from // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp //----------------------------------------------------------------------------- diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index cc532e377e7..d897609a1a1 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ namespace cudf { * @returns A column where each row is the hash of a column from the input */ std::unique_ptr hash(table_view const& input, + hash_id hash_function = hash_id::HASH_MURMUR3, std::vector const& initial_hash = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index f19ce096efb..596ca4840db 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -279,5 +279,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs. 
*/ std::size_t size_of(data_type t); +/** + * @brief Identifies the hash function to be used + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3, ///< Murmur3 hash function + HASH_MD5 ///< MD5 hash function +}; + /** @} */ } // namespace cudf diff --git a/cpp/src/hash/hash_constants.hpp b/cpp/src/hash/hash_constants.hpp new file mode 100644 index 00000000000..0a5a9e0be93 --- /dev/null +++ b/cpp/src/hash/hash_constants.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf { +namespace detail { + +struct md5_intermediate_data { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; + uint8_t buffer[64]; +}; + +// Type for the shift constants table. +using md5_shift_constants_type = uint32_t; + +__device__ __constant__ md5_shift_constants_type md5_shift_constants[16] = { + 7, + 12, + 17, + 22, + 5, + 9, + 14, + 20, + 4, + 11, + 16, + 23, + 6, + 10, + 15, + 21, +}; + +// Type for the hash constants table. 
+using md5_hash_constants_type = uint32_t; + +__device__ __constant__ md5_hash_constants_type md5_hash_constants[64] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, +}; +} // namespace detail +} // namespace cudf diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 5b4c581100e..1939c12dee7 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include #include @@ -634,9 +636,87 @@ std::pair, std::vector> hash_partition( } std::unique_ptr hash(table_view const& input, + hash_id hash_function, std::vector const& initial_hash, rmm::mr::device_memory_resource* mr, cudaStream_t stream) +{ + switch (hash_function) { + case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, mr, stream); + case (hash_id::HASH_MD5): return md5_hash(input, mr, stream); + default: return nullptr; + } +} + +std::unique_ptr md5_hash(table_view const& input, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) +{ + if (input.num_columns() == 0 || input.num_rows() == 0) { + const string_scalar string_128bit("d41d8cd98f00b204e9800998ecf8427e"); /* MD5 digest of the empty message, as 32 lowercase hex characters */ + auto output = make_column_from_scalar(string_128bit, input.num_rows(), mr, stream); + return output; + } + + CUDF_EXPECTS( + std::all_of(input.begin(), + input.end(), + [](auto col) { + return !is_chrono(col.type()) && + (is_fixed_width(col.type()) || (col.type().id() == type_id::STRING)); + }), + "MD5 unsupported column type"); + + // Result column allocation and creation + auto begin = thrust::make_constant_iterator(32); + auto offsets_column = + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), mr, stream); + auto offsets_view = offsets_column->view(); + auto d_new_offsets = offsets_view.data(); + + auto chars_column = strings::detail::create_chars_child_column( + input.num_rows(), 0, input.num_rows() * 32, mr, stream); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + + rmm::device_buffer null_mask{0, stream, mr}; + + bool const nullable = has_nulls(input); + auto const device_input = table_device_view::create(input, stream); + + // Hash each row, hashing each element sequentially left to right + thrust::for_each( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + 
thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input, has_nulls = nullable] __device__(auto row_index) { + md5_intermediate_data hash_state; + MD5Hash hasher = MD5Hash{}; + for (int col_index = 0; col_index < device_input.num_columns(); col_index++) { + if (device_input.column(col_index).is_valid(row_index)) { + cudf::type_dispatcher(device_input.column(col_index).type(), + hasher, + device_input.column(col_index), + row_index, + &hash_state); + } + } + hasher.finalize(&hash_state, d_chars + (row_index * 32)); + }); + + return make_strings_column(input.num_rows(), + std::move(offsets_column), + std::move(chars_column), + 0, + std::move(null_mask), + stream, + mr); +} + +std::unique_ptr murmur_hash3_32(table_view const& input, + std::vector const& initial_hash, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) { // TODO this should be UINT32 auto output = make_numeric_column( @@ -688,11 +768,20 @@ std::unique_ptr hash(table_view const& input, } // namespace detail std::unique_ptr hash(table_view const& input, + hash_id hash_function, std::vector const& initial_hash, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash(input, initial_hash, mr); + return detail::hash(input, hash_function, initial_hash, mr); +} + +std::unique_ptr murmur_hash3_32(table_view const& input, + std::vector const& initial_hash, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::murmur_hash3_32(input, initial_hash, mr); } } // namespace cudf diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index a5a520d7459..33736628d70 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -176,4 +176,158 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) expect_columns_equal(output1->view(), output2->view(), true); } +class MD5HashTest : public cudf::test::BaseFixture { +}; + +TEST_F(MD5HashTest, MultiValue) +{ + strings_column_wrapper const strings_col( + {"", + "A 60 character string to test MD5's message padding algorithm", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " + "MD5 hash function. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + strings_column_wrapper const md5_string_results1({"d41d8cd98f00b204e9800998ecf8427e", + "682240021651ae166d08fe2a014d5c09", + "3669d5225fddbb34676312ca3b78bbd9", + "c61a4185135eda043f35e92c3505e180", + "52da74c75cb6575d25be29e66bd0adde"}); + + strings_column_wrapper const md5_string_results2({"d41d8cd98f00b204e9800998ecf8427e", + "e5a5682e82278e78dbaad9a689df7a73", + "4121ab1bb6e84172fd94822645862ae9", + "28970886501efe20164213855afe5850", + "6bc1b872103cc6a02d882245b8516e2e"}); + + using limits = std::numeric_limits; + fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); + + // Different truth values should be equal + fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); + fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const md5_string_output1 = cudf::hash(string_input1, cudf::hash_id::HASH_MD5); + auto const md5_string_output2 = cudf::hash(string_input2, cudf::hash_id::HASH_MD5); + EXPECT_EQ(string_input1.num_rows(), md5_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), md5_string_output2->size()); + expect_columns_equal(md5_string_output1->view(), md5_string_results1); + expect_columns_equal(md5_string_output2->view(), md5_string_results2); + + auto 
const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const md5_output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const md5_output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + EXPECT_EQ(input1.num_rows(), md5_output1->size()); + expect_columns_equal(md5_output1->view(), md5_output2->view()); +} + +TEST_F(MD5HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " + "MD5 hash function. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + strings_column_wrapper const strings_col2( + {"", + "A 60 character string to test MD5's message padding algorithm", + "Very different... but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + fixed_width_column_wrapper const ints_col1({0, 100, -100, limits::min(), limits::max()}, + {1, 0, 0, 1, 1}); + fixed_width_column_wrapper const ints_col2({0, -200, 200, limits::min(), limits::max()}, + {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truthy values should be equal + fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + 
EXPECT_EQ(input1.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view()); +} + +template +class MD5HashTestTyped : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(MD5HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(MD5HashTestTyped, Equality) +{ + fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hash(input, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input, cudf::hash_id::HASH_MD5); + + EXPECT_EQ(input.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view()); +} + +TYPED_TEST(MD5HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + EXPECT_EQ(input1.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view()); +} + +template +class MD5HashTestFloatTyped : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(MD5HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(MD5HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + fixed_width_column_wrapper const col1({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = 
cudf::table_view({col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + expect_columns_equal(output1->view(), output2->view(), true); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index 1ee6c55cc95..6507cf67ae3 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -13,5 +13,6 @@ cimport cudf._lib.cpp.types as libcudf_types cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, + const libcudf_types.hash_id& hash_function, const vector[uint32_t]& initial_hash ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 903ab2a4be8..55458f00f8b 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -69,6 +69,11 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: LIST "cudf::type_id::LIST" NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" + ctypedef enum hash_id "cudf::hash_id": + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" + HASH_MD5 "cudf::hash_id::HASH_MD5" + cdef cppclass data_type: data_type() except + data_type(const data_type&) except + diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index a662cbac686..ade0bb2e17a 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -64,6 +64,7 @@ def hash(Table source_table, object initial_hash_values=None): c_result = move( cpp_hash( c_source_view, + libcudf_types.hash_id.HASH_MURMUR3, c_initial_hash ) )