From 0d3845c072d985885bbd40aabf554f8edbc47a74 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 7 Jul 2020 10:28:40 +0000 Subject: [PATCH] Initial MD5 implementation --- cpp/CMakeLists.txt | 1 + .../cudf/detail/utilities/hash_functions.cuh | 215 +++++++++++++++++- cpp/src/hash/hash_constants.cu | 96 ++++++++ cpp/src/hash/hash_constants.cuh | 22 ++ cpp/src/hash/hash_constants.hpp | 74 ++++++ cpp/src/hash/hashing.cu | 96 +++++++- cpp/tests/hashing/hash_test.cpp | 172 +++++++++++++- 7 files changed, 669 insertions(+), 7 deletions(-) create mode 100644 cpp/src/hash/hash_constants.cu create mode 100644 cpp/src/hash/hash_constants.cuh create mode 100644 cpp/src/hash/hash_constants.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3f14909ad38..4141ab2820a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -423,6 +423,7 @@ add_library(cudf src/stream_compaction/drop_duplicates.cu src/datetime/datetime_ops.cu src/hash/hashing.cu + src/hash/hash_constants.cu src/partitioning/partitioning.cu src/quantiles/quantile.cu src/quantiles/quantiles.cu diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 8f26690c835..9d145fd20c8 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,221 @@ #pragma once +#include #include +#include + +#include "cuda_runtime_api.h" +#include "cudf/types.hpp" +#include "driver_types.h" +#include "vector_types.h" using hash_value_type = uint32_t; +namespace cudf { +namespace detail { + + /** + * @brief Helper function: rotates a 32-bit value left by `shift` bits (expects shift in [1, 31]; 0 would shift right by the full 32 bits) + */ + CUDA_HOST_DEVICE_CALLABLE uint32_t left_rotate(uint32_t value, uint32_t shift) + { + return (value << shift) | (value >> (32-shift)); + } + + /** + * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents, but resets buffer_length to 0. + */ + void CUDA_HOST_DEVICE_CALLABLE + md5_hash_step(md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) + { + uint32_t A = hash_state->hash_value[0]; + uint32_t B = hash_state->hash_value[1]; + uint32_t C = hash_state->hash_value[2]; + uint32_t D = hash_state->hash_value[3]; + + uint32_t* buffer_ints = (uint32_t*)hash_state->buffer; /* read the 64-byte buffer as 16 32-bit words */ + + for(unsigned int j = 0; j < 64; j++) { /* 64 steps, handled as 4 groups of 16 by the switch below */ + uint32_t F, g; /* F: mixing function value for this step; g: index of the buffer word used */ + switch(j / 16) { + case 0 : + F = (B & C) | ((~B)&D); // D ^ (B & (C ^ D)) + g = j; + break; + case 1 : + F = (D & B) | ((~D)&C); + g = (5 * j + 1) % 16; + break; + case 2 : + F = B ^ C ^ D; + g = (3 * j + 5) % 16; + break; + case 3 : + F = C ^ (B | (~D)); + g = (7 * j) % 16; + break; + } + + F = F + A + hash_constants[j] + buffer_ints[g]; + + A = D; + D = C; + C = B; + B = B + left_rotate(F, shift_constants[((j/16) * 4) + (j%4)]); + } + + hash_state->hash_value[0] += A; + hash_state->hash_value[1] += B; + hash_state->hash_value[2] += C; + hash_state->hash_value[3] += D; + + hash_state->buffer_length = 0; + } + + template + struct MD5Hash { + using argument_type = Key; + + /** + * @brief Core MD5 element processing function: appends len bytes of key to the hash state + */ + template + void CUDA_HOST_DEVICE_CALLABLE + process(TKey const& key, + const uint32_t len, + md5_intermediate_data* hash_state, + const
md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) const + { + uint8_t* data = (uint8_t*)&key; /* the raw object bytes of key are what gets hashed */ + hash_state->message_length += len; + + if(hash_state->buffer_length + len < 64) { /* still short of a full 64-byte block: only stage the bytes */ + thrust::copy_n(thrust::seq, data, len, hash_state->buffer+hash_state->buffer_length); + hash_state->buffer_length += len; + } else { + uint32_t copylen = 64 - hash_state->buffer_length; /* bytes that complete the pending block; from here on, the number of bytes of key consumed */ + + thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer+hash_state->buffer_length); + md5_hash_step(hash_state, hash_constants, shift_constants); + + while(len >= 64 + copylen) { /* >= (not >): hash every complete block now so buffer_length can never be left at 64, which finalize cannot pad */ + thrust::copy_n(thrust::seq, data+copylen, 64, hash_state->buffer); + md5_hash_step(hash_state, hash_constants, shift_constants); + copylen += 64; + } + + thrust::copy_n(thrust::seq, data+copylen, len-copylen, hash_state->buffer); /* stage the remaining tail (fewer than 64 bytes) */ + hash_state->buffer_length = len-copylen; + } + } + + template ()>* = nullptr> + void CUDA_HOST_DEVICE_CALLABLE + operator()(T const& key, + md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) const + { + process(key, size_of(key), hash_state, hash_constants, shift_constants); /* NOTE(review): confirm size_of(key) resolves to the byte width of T */ + } + + template ()>* = nullptr> + void CUDA_HOST_DEVICE_CALLABLE + operator()(T const& key, + md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) const + { + CUDF_FAIL("Unsupported hash type"); + } + + void CUDA_HOST_DEVICE_CALLABLE + operator()(Key const& key, + md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) const {} + }; + + /** + * @brief Specialization of MD5Hash operator for strings.
+ */ + template <> + void CUDA_HOST_DEVICE_CALLABLE + MD5Hash::operator()(cudf::string_view const& key, + md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) const + { + const uint32_t len = (uint32_t)key.size_bytes(); + const uint8_t* data = (const uint8_t*)key.data(); + + hash_state->message_length += len; + + if(hash_state->buffer_length + len < 64) { /* still short of a full 64-byte block: only stage the bytes */ + thrust::copy_n(thrust::seq, data, len, hash_state->buffer+hash_state->buffer_length); + hash_state->buffer_length += len; + } else { + uint32_t copylen = 64 - hash_state->buffer_length; /* bytes that complete the pending block; from here on, the number of string bytes consumed */ + thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer+hash_state->buffer_length); + md5_hash_step(hash_state, hash_constants, shift_constants); + + while(len >= 64 + copylen) { /* >= (not >): with > a string ending exactly on a block boundary left buffer_length == 64, which finalize cannot pad */ + thrust::copy_n(thrust::seq, data+copylen, 64, hash_state->buffer); + md5_hash_step(hash_state, hash_constants, shift_constants); + copylen += 64; + } + + thrust::copy_n(thrust::seq, data+copylen, len-copylen, hash_state->buffer); /* stage the remaining tail (fewer than 64 bytes) */ + hash_state->buffer_length = len-copylen; + } + } + + /** + * @brief Finalize MD5 hash including conversion to hex string.
+ */ + void CUDA_HOST_DEVICE_CALLABLE + finalize_md5_hash(md5_intermediate_data* hash_state, + char* result_location, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants, + const hex_to_char_mapping_type* hex_char_map) + { + uint64_t full_length = ((uint64_t)hash_state->message_length) << 3; /* message length in bits, appended after the padding */ + if(hash_state->buffer_length >= 64) { md5_hash_step(hash_state, hash_constants, shift_constants); } /* flush a completely full block so the 0x80 marker below stays inside the buffer */ + thrust::fill_n(thrust::seq, hash_state->buffer+hash_state->buffer_length, 1, 0x80); + + if(hash_state->buffer_length <= 55) { /* marker and 8 length bytes fit in this block: zero up to index 55 */ + thrust::fill_n(thrust::seq, hash_state->buffer+hash_state->buffer_length+1, + (55 - hash_state->buffer_length), 0x00); + } else { + thrust::fill_n(thrust::seq, hash_state->buffer+hash_state->buffer_length+1, + (63 - hash_state->buffer_length), 0x00); /* zero indices buffer_length+1 .. 63 only; a count of 64 - buffer_length wrote one byte past the 64-byte buffer */ + md5_hash_step(hash_state, hash_constants, shift_constants); + + thrust::fill_n(thrust::seq, hash_state->buffer, 56, 0x00); + } + + thrust::copy_n(thrust::seq, (uint8_t*)&full_length, 8, hash_state->buffer+56); /* copies the in-memory bytes of full_length into the last 8 buffer bytes */ + md5_hash_step(hash_state, hash_constants, shift_constants); + + char final_hash[32]; /* 16 digest bytes rendered as 32 characters via hex_char_map */ + uint8_t* hash_result = (uint8_t*)hash_state->hash_value; + for(int i = 0; i < 16; i++) { + final_hash[i*2] = hex_char_map[(hash_result[i] >> 4) & 0xf]; + final_hash[i*2+1] = hex_char_map[hash_result[i] & 0xf]; + } + + thrust::copy_n(thrust::seq, final_hash, 32, result_location); /* writes exactly 32 characters, no terminator */ + } + +} // namespace detail +} // namespace cudf + // MurmurHash3_32 implementation from // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp //----------------------------------------------------------------------------- @@ -250,4 +461,4 @@ struct IdentityHash { }; template -using default_hash = MurmurHash3_32; +using default_hash = MurmurHash3_32; \ No newline at end of file diff --git a/cpp/src/hash/hash_constants.cu b/cpp/src/hash/hash_constants.cu new file mode 100644 index 00000000000..648b8b65fe3 --- /dev/null +++ b/cpp/src/hash/hash_constants.cu @@ -0,0 +1,96 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "./hash_constants.hpp" + +#include + +namespace cudf { +namespace detail { + +const hex_to_char_mapping_type g_hex_to_char_mapping[] = { /* host table: index 0-15 -> hex digit character */ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' +}; +const md5_shift_constants_type g_md5_shift_constants[] = { /* host table: 16 left-rotate amounts, 4 per group of 16 MD5 steps */ + 7, 12, 17, 22, 5, 9, 14, 20, 4, 11, 16, 23 , 6, 10, 15, 21, +}; + +const md5_hash_constants_type g_md5_hash_constants[] = { /* host table: 64 additive constants, one per MD5 step */ + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, +}; + +std::mutex g_hex_to_char_mapping_mutex; /* one mutex per table; each is locked by the matching get_* accessor */ +std::mutex g_md5_hash_constants_mutex; +std::mutex g_md5_shift_constants_mutex; + +hex_to_char_mapping_type* d_hex_to_char_mapping = nullptr; /* cached device address of the table; nullptr until the accessor has run once */ +md5_hash_constants_type*
d_md5_hash_constants = nullptr; +md5_shift_constants_type* d_md5_shift_constants = nullptr; + +__device__ hex_to_char_mapping_type hex_to_char_mapping[sizeof(g_hex_to_char_mapping) / sizeof(g_hex_to_char_mapping[0])]; /* extent is an element count, so divide the byte size by the element size */ +__device__ md5_hash_constants_type md5_hash_constants[sizeof(g_md5_hash_constants) / sizeof(g_md5_hash_constants[0])]; /* 64 entries; sizeof alone reserved 4x the needed storage */ +__device__ md5_shift_constants_type md5_shift_constants[sizeof(g_md5_shift_constants) / sizeof(g_md5_shift_constants[0])]; /* 16 entries */ + +/** + * @copydoc cudf::detail::get_hex_to_char_mapping + */ +const hex_to_char_mapping_type* get_hex_to_char_mapping() +{ + std::lock_guard guard(g_hex_to_char_mapping_mutex); /* serializes the one-time upload below */ + if (!d_hex_to_char_mapping) { /* first call: copy the host table into the device symbol and cache its address */ + CUDA_TRY(cudaMemcpyToSymbol( + hex_to_char_mapping, g_hex_to_char_mapping, sizeof(g_hex_to_char_mapping))); + CUDA_TRY(cudaGetSymbolAddress((void**)&d_hex_to_char_mapping, hex_to_char_mapping)); + } + return d_hex_to_char_mapping; +} + +/** + * @copydoc cudf::detail::get_md5_hash_constants + */ +const md5_hash_constants_type* get_md5_hash_constants() +{ + std::lock_guard guard(g_md5_hash_constants_mutex); /* serializes the one-time upload below */ + if (!d_md5_hash_constants) { /* first call: copy the host table into the device symbol and cache its address */ + CUDA_TRY(cudaMemcpyToSymbol( + md5_hash_constants, g_md5_hash_constants, sizeof(g_md5_hash_constants))); + CUDA_TRY(cudaGetSymbolAddress((void**)&d_md5_hash_constants, md5_hash_constants)); + } + return d_md5_hash_constants; +} + +/** + * @copydoc cudf::detail::get_md5_shift_constants + */ +const md5_shift_constants_type* get_md5_shift_constants() +{ + std::lock_guard guard(g_md5_shift_constants_mutex); /* serializes the one-time upload below */ + if (!d_md5_shift_constants) { /* first call: copy the host table into the device symbol and cache its address */ + CUDA_TRY(cudaMemcpyToSymbol( + md5_shift_constants, g_md5_shift_constants, sizeof(g_md5_shift_constants))); + CUDA_TRY(cudaGetSymbolAddress((void**)&d_md5_shift_constants, md5_shift_constants)); + } + return d_md5_shift_constants; +} + +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/hash/hash_constants.cuh b/cpp/src/hash/hash_constants.cuh new file mode 100644 index 00000000000..f7cec3f0652 --- /dev/null +++ b/cpp/src/hash/hash_constants.cuh @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020, NVIDIA
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf { +namespace detail { + +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/hash/hash_constants.hpp b/cpp/src/hash/hash_constants.hpp new file mode 100644 index 00000000000..f6d9b195e3f --- /dev/null +++ b/cpp/src/hash/hash_constants.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf { +namespace detail { + +struct md5_intermediate_data{ /* running state of one in-progress MD5 computation */ + uint64_t message_length = 0; /* total number of bytes fed to the hash so far */ + uint32_t buffer_length = 0; /* number of pending bytes currently staged in buffer */ + uint32_t hash_value[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; /* running A, B, C, D digest words, starting from the MD5 initial values */ + uint8_t buffer[64]; /* staging area for one 64-byte block; not initialized here */ +}; + +// Type for the shift constants table. +using md5_shift_constants_type = uint32_t; + +/** + * @brief Returns pointer to device memory that contains the static + * md5 shift constants table.
On first call, this will copy the table into + * device memory and is guaranteed to be thread-safe. + * + * This table is used in the MD5 hash to look up the number of bits + * to rotate left during each hash iteration. + * + * @return Device memory pointer to the MD5 shift constants table. + */ +const md5_shift_constants_type* get_md5_shift_constants(); + +// Type for the hash constants table. +using md5_hash_constants_type = uint32_t; + +/** + * @brief Returns pointer to device memory that contains the static + * md5 hash constants table. On first call, this will copy the table into + * device memory and is guaranteed to be thread-safe. + * + * This table is used in the MD5 hash to look up values added to + * the hash during each hash iteration. + * + * @return Device memory pointer to the MD5 hash constants table. + */ +const md5_hash_constants_type* get_md5_hash_constants(); + +// Type for the hexadecimal character mapping. +using hex_to_char_mapping_type = char; + +/** + * @brief Returns pointer to device memory that contains the static + * hexadecimal character map. On first call, this will copy the table into + * device memory and is guaranteed to be thread-safe. + * + * This table is used to look up the corresponding hex character when + * translating the hash into a hexadecimal string. + * + * @return Device memory pointer to the hex to char map. + */ +const hex_to_char_mapping_type* get_hex_to_char_mapping(); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index fb40de7d8e1..b0acbc1aaa4 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ +#include #include #include #include @@ -26,6 +27,7 @@ #include #include +#include #include namespace cudf { @@ -641,17 +643,103 @@ std::unique_ptr hash(table_view const& input, cudaStream_t stream) { switch(hash_function) { - // case(hash_id::HASH_IDENTITY) : - // return identity_hash(input); case(hash_id::HASH_MURMUR3) : return murmur_hash3_32(input, initial_hash, mr, stream); - // case(hash_id::HASH_MD5) : - // return md5_hash(input, mr, stream); + case(hash_id::HASH_MD5) : + return md5_hash(input, mr, stream); default : return NULL; } } +/** + * @brief Updates the MD5 hash value with an element in the given column. + * + * @tparam has_nulls Indicates the potential for null values in the column. + **/ + template + class md5_element_hasher { + public: + template + __device__ inline void operator()(column_device_view col, + size_type row_index, + md5_intermediate_data* hash_state, + const md5_hash_constants_type* hash_constants, + const md5_shift_constants_type* shift_constants) + { + if (!has_nulls || col.is_valid(row_index)) { /* null elements contribute no bytes to the row's hash */ + MD5Hash{}(col.element(row_index), hash_state, hash_constants, shift_constants); + } + } +}; + +std::unique_ptr md5_hash(table_view const& input, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) +{ + if(input.num_columns() == 0 || input.num_rows() == 0) { + const string_scalar string_128bit("d41d8cd98f00b204e9800998ecf8427e"); /* MD5 digest of the empty message; the previous literal contained non-hex characters */ + auto output = make_column_from_scalar(string_128bit, input.num_rows(), mr, stream); + return output; + } + + // Result column allocation and creation + auto transformer = [] __device__(size_type idx) { return 32; }; /* every output string is exactly 32 characters long */ + auto begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), transformer); + auto offsets_column = + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), mr, stream); + auto offsets_view = offsets_column->view(); + auto d_new_offsets = offsets_view.data(); /* NOTE(review): d_new_offsets is not used in the visible code */ + + auto chars_column = +
strings::detail::create_chars_child_column(input.num_rows(), 0, input.num_rows()*32, mr, stream); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + + rmm::device_buffer null_mask{0, stream, mr}; + + bool const nullable = has_nulls(input); + auto const device_input = table_device_view::create(input, stream); + + // Fetch hash constants + const md5_shift_constants_type* shift_constants = get_md5_shift_constants(); + const md5_hash_constants_type* hash_constants = get_md5_hash_constants(); + const hex_to_char_mapping_type* hex_char_map = get_hex_to_char_mapping(); + + // Hash each row, hashing each element sequentially left to right + thrust::for_each( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input, hash_constants = hash_constants, shift_constants = shift_constants, hex_char_map = hex_char_map, has_nulls = nullable] __device__ (auto row_index) { + md5_intermediate_data hash_state; + for(int col_index = 0; col_index < device_input.num_columns(); col_index++) { + if (!has_nulls) { + cudf::type_dispatcher( + device_input.column(col_index).type(), + md5_element_hasher{}, + device_input.column(col_index), + row_index, &hash_state, hash_constants, shift_constants); + } else { + cudf::type_dispatcher( + device_input.column(col_index).type(), + md5_element_hasher{}, + device_input.column(col_index), + row_index, &hash_state, hash_constants, shift_constants); + } + } + finalize_md5_hash(&hash_state, d_chars + (row_index*32), hash_constants, shift_constants, hex_char_map); + }); + + return make_strings_column(input.num_rows(), + std::move(offsets_column), + std::move(chars_column), + 0, + std::move(null_mask), + stream, + mr); +} + std::unique_ptr murmur_hash3_32(table_view const& input, std::vector const& initial_hash, rmm::mr::device_memory_resource* mr, diff --git a/cpp/tests/hashing/hash_test.cpp
b/cpp/tests/hashing/hash_test.cpp index 86d62c537f7..37ffdbbd6bb 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -176,4 +176,174 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) expect_columns_equal(output1->view(), output2->view(), true); } +class MD5HashTest : public cudf::test::BaseFixture { +}; + +TEST_F(MD5HashTest, MultiValue) +{ + strings_column_wrapper const strings_col({"", + "A 60 character string to test MD5's message padding algorithm", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the MD5 hash function. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + strings_column_wrapper const md5_string_results1({"d41d8cd98f00b204e9800998ecf8427e", + "682240021651ae166d08fe2a014d5c09", + "3669d5225fddbb34676312ca3b78bbd9", + "c61a4185135eda043f35e92c3505e180", + "52da74c75cb6575d25be29e66bd0adde"}); + + strings_column_wrapper const md5_string_results2({"d41d8cd98f00b204e9800998ecf8427e", + "e5a5682e82278e78dbaad9a689df7a73", + "4121ab1bb6e84172fd94822645862ae9", + "28970886501efe20164213855afe5850", + "6bc1b872103cc6a02d882245b8516e2e"}); + + using limits = std::numeric_limits; + fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); + + // Different truthy values should produce equal hashes + fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); + fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + + using ts = cudf::timestamp_s; + fixed_width_column_wrapper const secs_col({ts::duration::zero(), + static_cast(100), + static_cast(-100), + ts::duration::min(), + ts::duration::max()}); + + auto const
string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const md5_string_output1 = cudf::hash(string_input1, cudf::hash_id::HASH_MD5); + auto const md5_string_output2 = cudf::hash(string_input2, cudf::hash_id::HASH_MD5); + EXPECT_EQ(string_input1.num_rows(), md5_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), md5_string_output2->size()); + expect_columns_equal(md5_string_output1->view(), md5_string_results1); + expect_columns_equal(md5_string_output2->view(), md5_string_results2); + + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1, secs_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2, secs_col}); + auto const md5_output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const md5_output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + EXPECT_EQ(input1.num_rows(), md5_output1->size()); + expect_columns_equal(md5_output1->view(), md5_output2->view()); +} + +TEST_F(MD5HashTest, MultiValueNulls) +{ + // Null rows should hash equally regardless of the underlying values + strings_column_wrapper const strings_col1({"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the MD5 hash function. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + strings_column_wrapper const strings_col2({"", + "A 60 character string to test MD5's message padding algorithm", + "Very different...
but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + fixed_width_column_wrapper const ints_col1({0, 100, -100, limits::min(), limits::max()}, + {1, 0, 0, 1, 1}); + fixed_width_column_wrapper const ints_col2({0, -200, 200, limits::min(), limits::max()}, + {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truthy values should be equal + fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + // Nulls with different values should be equal + using ts = cudf::timestamp_s; + fixed_width_column_wrapper const secs_col1({ts::duration::zero(), + static_cast(100), + static_cast(-100), + ts::duration::min(), + ts::duration::max()}, + {1, 0, 0, 1, 1}); + fixed_width_column_wrapper const secs_col2({ts::duration::zero(), + static_cast(-200), + static_cast(200), + ts::duration::min(), + ts::duration::max()}, + {1, 0, 0, 1, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + EXPECT_EQ(input1.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view(), true); +} + +template +class MD5HashTestTyped : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(MD5HashTestTyped, cudf::test::FixedWidthTypes); + +TYPED_TEST(MD5HashTestTyped, Equality) +{ + fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hash(input, cudf::hash_id::HASH_MD5); + auto const output2 =
cudf::hash(input, cudf::hash_id::HASH_MD5); + + EXPECT_EQ(input.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view()); +} + +TYPED_TEST(MD5HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Null rows should hash equally regardless of the underlying values + fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + EXPECT_EQ(input1.num_rows(), output1->size()); + expect_columns_equal(output1->view(), output2->view()); +} + +template +class MD5HashTestFloatTyped : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(MD5HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(MD5HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + fixed_width_column_wrapper const col1({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col2({T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); /* NOTE(review): MD5Hash::process in this patch hashes the raw bytes of each element, so +0.0/-0.0 and nan/-nan look like they would give different digests - confirm this equality is expected to hold */ + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hash(input1, cudf::hash_id::HASH_MD5); + auto const output2 = cudf::hash(input2, cudf::hash_id::HASH_MD5); + + expect_columns_equal(output1->view(), output2->view(), true); +} + CUDF_TEST_PROGRAM_MAIN()