From 7bc4a7e1bc22fb1635e69d5efbf86ca5d99631de Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 13 Mar 2023 11:36:56 -0400 Subject: [PATCH] Fix passing seed parameter to MurmurHash3_32 in cudf::hash() function (#12875) Fixes passing seed parameter to `MurmurHash3_32` in `cudf::hash()` function. The `MurmurHash3_32` algorithm takes a seed value which helps provide variation in hash results. The seed parameter was not being passed to the algorithm through the `element_hasher` but only used in the `hash_combine` functions. This resulted in only small variations in the hash results. The following example illustrates: ``` { cudf::test::strings_column_wrapper const strings_col({"hello world"}); auto const input1 = cudf::table_view({strings_col}); for (int i = 0; i < 5; ++i) { auto output1 = cudf::hash(input1, cudf::hash_id::HASH_MURMUR3, i); std::cout << i << ": "; cudf::test::print(output1->view()); } } ``` The output for the 5 hashes with associated seed values: ``` seed hash 0 4241098952 1 4241099017 2 4241099082 3 4241099147 4 4241099213 ``` A `MurmurHash3_32` algorithm would produce the following variations if given the seed values: ``` seed hash 0 1586663183 1 1128525090 2 3382554948 3 1761283998 4 1862001904 ``` This PR passes the seed value through to the internal algorithm. Although the results do not exactly match the standard `MurmurHash3_32` the variations are much improved: ``` seed hash 0 4241098952 1 3782960922 2 1742023551 3 120752660 4 221470638 ``` This variation is important for some machine learning operations. Some gtests have hardcoded the hash values and were updated to match the new results. Authors: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/12875 --- .../cudf/table/experimental/row_operators.cuh | 9 ++- cpp/tests/hashing/hash_test.cpp | 80 +++++++++---------- 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 58f20adb923..846bebfe10b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1711,7 +1711,10 @@ class device_row_hasher { { auto it = thrust::make_transform_iterator(_table.begin(), [=](auto const& column) { return cudf::type_dispatcher( - column.type(), element_hasher_adapter{_check_nulls}, column, row_index); + column.type(), + element_hasher_adapter{_check_nulls, _seed}, + column, + row_index); }); // Hash each element and combine all the hash values together @@ -1734,8 +1737,8 @@ class device_row_hasher { static constexpr hash_value_type NON_NULL_HASH = 0; public: - __device__ element_hasher_adapter(Nullate check_nulls) noexcept - : _element_hasher(check_nulls), _check_nulls(check_nulls) + __device__ element_hasher_adapter(Nullate check_nulls, uint32_t seed) noexcept + : _element_hasher(check_nulls, seed), _check_nulls(check_nulls) { } diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index c1a73761e8d..bb96b50c624 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -152,16 +152,16 @@ TEST_F(HashTest, BasicList) auto const expect_seeded = ICW{1607594268u, 1607594268u, - 3658958173u, - 4162508905u, - 3658958173u, - 2286117305u, - 3271180885u, - 761198477u, - 761198477u, - 1340178469u, - 3271180885u, - 3271180885u}; + 1576790066u, + 1203671017u, + 1576790066u, + 2107478077u, + 1756855002u, + 2228938758u, + 2228938758u, + 3491134126u, + 1756855002u, + 1756855002u}; auto const seeded_output = cudf::hash(input, cudf::hash_id::HASH_MURMUR3, 15); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); @@ -192,14 +192,14 @@ TEST_F(HashTest, NullableList) auto const expect_seeded = ICW{2271820643u, 2271820643u, - 4263297392u, - 4263297392u, - 3089720935u, - 1865775808u, - 1865775808u, + 1038318696u, + 1038318696u, + 595138041u, + 3027840870u, + 3027840870u, 2271820578u, - 3089720935u, - 3089720935u, + 595138041u, + 595138041u, 2271820578u}; auto const seeded_output = cudf::hash(cudf::table_view({col}), cudf::hash_id::HASH_MURMUR3, 31); @@ -256,14 +256,14 @@ TEST_F(HashTest, ListOfStruct) 3642097855u, 3642097855u, 3642110391u, - 3624905718u, - 608933631u, - 1899376347u, - 1899376347u, - 2058877614u, - 2058877614u, - 4013395891u, - 4013395891u}; + 3889855760u, + 1494406307u, + 103934081u, + 103934081u, + 3462063680u, + 3462063680u, + 1696730835u, + 1696730835u}; auto const seeded_output = cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_MURMUR3, 619); @@ -303,19 +303,19 @@ TEST_F(HashTest, ListOfEmptyStruct) cudf::UNKNOWN_NULL_COUNT, std::move(list_validity_buffer)); - auto expect = cudf::test::fixed_width_column_wrapper{-2023148619, - -2023148619, - -2023148682, - -2023148682, - -340558283, - -340558283, - -340558283, - -1999301021, - -1999301021, - -1999301020, - -1999301020, - -340558244, - -340558244}; + auto expect = cudf::test::fixed_width_column_wrapper{2271818677u, + 2271818677u, + 2271818614u, + 2271818614u, + 3954409013u, + 3954409013u, + 3954409013u, + 2295666275u, + 2295666275u, + 2295666276u, + 2295666276u, + 3954409052u, + 3954409052u}; auto output = cudf::hash(cudf::table_view({*list_column})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); @@ -343,7 +343,7 @@ TEST_F(HashTest, EmptyDeepList) std::move(list_validity_buffer)); auto expect = cudf::test::fixed_width_column_wrapper{ - -2023148619, -2023148619, -2023148682, -2023148682}; + 2271818677u, 2271818677u, 2271818614u, 2271818614u}; auto output = cudf::hash(cudf::table_view({*list_column})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);