From 46ced7adbbd8f62f095389f746cfc427e7e6d1ef Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 1 Mar 2023 22:28:26 -0500 Subject: [PATCH 1/2] Fix passing seed parameter to MurmurhHash3_32 in cudf::hash() function --- .../cudf/table/experimental/row_operators.cuh | 9 +- cpp/tests/hashing/hash_test.cpp | 90 +++++++++++-------- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 2a207d2a5c4..983ce648a25 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1713,7 +1713,10 @@ class device_row_hasher { { auto it = thrust::make_transform_iterator(_table.begin(), [=](auto const& column) { return cudf::type_dispatcher( - column.type(), element_hasher_adapter{_check_nulls}, column, row_index); + column.type(), + element_hasher_adapter{_check_nulls, _seed}, + column, + row_index); }); // Hash each element and combine all the hash values together @@ -1736,8 +1739,8 @@ class device_row_hasher { static constexpr hash_value_type NON_NULL_HASH = 0; public: - __device__ element_hasher_adapter(Nullate check_nulls) noexcept - : _element_hasher(check_nulls), _check_nulls(check_nulls) + __device__ element_hasher_adapter(Nullate check_nulls, uint32_t seed) noexcept + : _element_hasher(check_nulls, seed), _check_nulls(check_nulls) { } diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index c1a73761e8d..18098cd6dc2 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -64,6 +64,18 @@ TEST_F(HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } +TEST_F(HashTest, StringVariance) +{ + cudf::test::strings_column_wrapper const strings_col({"hello world"}); + + auto const input1 = cudf::table_view({strings_col}); + for (int i = 0; i < 5; ++i) { + auto output1 = cudf::hash(input1, cudf::hash_id::HASH_MURMUR3, i); + std::cout << i << ": "; + cudf::test::print(output1->view()); + } +} + TEST_F(HashTest, MultiValueNulls) { // Nulls with different values should be equal @@ -152,16 +164,16 @@ TEST_F(HashTest, BasicList) auto const expect_seeded = ICW{1607594268u, 1607594268u, - 3658958173u, - 4162508905u, - 3658958173u, - 2286117305u, - 3271180885u, - 761198477u, - 761198477u, - 1340178469u, - 3271180885u, - 3271180885u}; + 1576790066u, + 1203671017u, + 1576790066u, + 2107478077u, + 1756855002u, + 2228938758u, + 2228938758u, + 3491134126u, + 1756855002u, + 1756855002u}; auto const seeded_output = cudf::hash(input, cudf::hash_id::HASH_MURMUR3, 15); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); @@ -192,14 +204,14 @@ TEST_F(HashTest, NullableList) auto const expect_seeded = ICW{2271820643u, 2271820643u, - 4263297392u, - 4263297392u, - 3089720935u, - 1865775808u, - 1865775808u, + 1038318696u, + 1038318696u, + 595138041u, + 3027840870u, + 3027840870u, 2271820578u, - 3089720935u, - 3089720935u, + 595138041u, + 595138041u, 2271820578u}; auto const seeded_output = cudf::hash(cudf::table_view({col}), cudf::hash_id::HASH_MURMUR3, 31); @@ -256,14 +268,14 @@ TEST_F(HashTest, ListOfStruct) 3642097855u, 3642097855u, 3642110391u, - 3624905718u, - 608933631u, - 1899376347u, - 1899376347u, - 2058877614u, - 2058877614u, - 4013395891u, - 4013395891u}; + 3889855760u, + 1494406307u, + 103934081u, + 103934081u, + 3462063680u, + 3462063680u, + 1696730835u, + 1696730835u}; auto const seeded_output = cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_MURMUR3, 619); @@ -303,19 +315,19 @@ TEST_F(HashTest, ListOfEmptyStruct) cudf::UNKNOWN_NULL_COUNT, std::move(list_validity_buffer)); - auto expect = cudf::test::fixed_width_column_wrapper{-2023148619, - -2023148619, - -2023148682, - -2023148682, - -340558283, - -340558283, - -340558283, - -1999301021, - -1999301021, - -1999301020, - -1999301020, - -340558244, - -340558244}; + auto expect = cudf::test::fixed_width_column_wrapper{2271818677u, + 2271818677u, + 2271818614u, + 2271818614u, + 3954409013u, + 3954409013u, + 3954409013u, + 2295666275u, + 2295666275u, + 2295666276u, + 2295666276u, + 3954409052u, + 3954409052u}; auto output = cudf::hash(cudf::table_view({*list_column})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); @@ -343,7 +355,7 @@ TEST_F(HashTest, EmptyDeepList) std::move(list_validity_buffer)); auto expect = cudf::test::fixed_width_column_wrapper{ - -2023148619, -2023148619, -2023148682, -2023148682}; + 2271818677u, 2271818677u, 2271818614u, 2271818614u}; auto output = cudf::hash(cudf::table_view({*list_column})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); From 7d7d35b920ee0b9e6c2cac755dea3d08e6ba69e6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 1 Mar 2023 22:52:25 -0500 Subject: [PATCH 2/2] remove temp gtest --- cpp/tests/hashing/hash_test.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index 18098cd6dc2..bb96b50c624 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,18 +64,6 @@ TEST_F(HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TEST_F(HashTest, StringVariance) -{ - cudf::test::strings_column_wrapper const strings_col({"hello world"}); - - auto const input1 = cudf::table_view({strings_col}); - for (int i = 0; i < 5; ++i) { - auto output1 = cudf::hash(input1, cudf::hash_id::HASH_MURMUR3, i); - std::cout << i << ": "; - cudf::test::print(output1->view()); - } -} - TEST_F(HashTest, MultiValueNulls) { // Nulls with different values should be equal