diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 52ca52eb2ff..7bfb87bdf81 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,10 +55,9 @@ TEST_F(MD5HashTest, MultiValue) cudf::test::fixed_width_column_wrapper const ints_col( {0, 100, -100, limits::min(), limits::max()}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0}); + // Test against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const md5_string_output1 = cudf::hashing::md5(string_input1); @@ -68,47 +67,23 @@ TEST_F(MD5HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output1->view(), md5_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output2->view(), md5_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const md5_output1 = cudf::hashing::md5(input1); auto const md5_output2 = cudf::hashing::md5(input2); EXPECT_EQ(input1.num_rows(), md5_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_output1->view(), md5_output2->view()); } -TEST_F(MD5HashTest, MultiValueNulls) +TEST_F(MD5HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " - "MD5 hash function. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "A 60 character string to test MD5's message padding algorithm", - "Very different... but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::md5(input1); auto const output2 = cudf::hashing::md5(input2); @@ -117,10 +92,12 @@ TEST_F(MD5HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TEST_F(MD5HashTest, StringListsNulls) +TEST_F(MD5HashTest, StringLists) { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + // Test of data serialization: a string should hash the same as a list of + // strings that concatenate to the same input. cudf::test::strings_column_wrapper const strings_col( {"", "A 60 character string to test MD5's message padding algorithm", @@ -131,7 +108,7 @@ TEST_F(MD5HashTest, StringListsNulls) cudf::test::lists_column_wrapper strings_list_col( {{""}, - {{"NULL", "A 60 character string to test MD5's message padding algorithm"}, validity}, + {{"", "A 60 character string to test MD5's message padding algorithm"}, validity}, {"A very long (greater than 128 bytes/char string) to test a multi hash-step data point in " "the " "MD5 hash function. This string needed to be longer.", @@ -153,7 +130,7 @@ class MD5HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(MD5HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(MD5HashTestTyped, Equality) +TYPED_TEST(MD5HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -166,31 +143,26 @@ TYPED_TEST(MD5HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(MD5HashTestTyped, EqualityNulls) +TYPED_TEST(MD5HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::md5(input1); - auto const output2 = cudf::hashing::md5(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::md5(input); + auto const output2 = cudf::hashing::md5(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } TEST_F(MD5HashTest, TestBoolListsWithNulls) { - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 1, 1, 1, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 1, 0, 1, 0, 1, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 1, 1, 0, 0, 0, 1}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); @@ -218,16 +190,16 @@ TYPED_TEST(MD5HashListTestTyped, TestListsWithNulls) { using T = TypeParam; - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 27, 18, 100, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 32, 0, 68, 0, 101, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 64, 49, 0, 0, 0, 102}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); cudf::test::lists_column_wrapper const list_col( - {{0, 0, 0}, {127}, {}, {{32, 127, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, + {{0, 0, 0}, {}, {}, {{32, 0, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, validity); auto const input1 = cudf::table_view({col1, col2, col3});