Skip to content

Commit

Permalink
Fix passing seed parameter to MurmurHash3_32 in cudf::hash() function (
Browse files Browse the repository at this point in the history
…#12875)

Fixes passing seed parameter to `MurmurHash3_32` in `cudf::hash()` function. The `MurmurHash3_32` algorithm takes a seed value which helps provide variation in hash results. The seed parameter was not being passed to the algorithm through the `element_hasher` but only used in the `hash_combine` functions. This resulted in only small variations in the hash results. The following example illustrates:
```
{
  cudf::test::strings_column_wrapper const strings_col({"hello world"});
  auto const input1 = cudf::table_view({strings_col});
  for (int i = 0; i < 5; ++i) {
    auto output1 = cudf::hash(input1, cudf::hash_id::HASH_MURMUR3, i);
    std::cout << i << ": ";
    cudf::test::print(output1->view());
  }
}
```
The output for the 5 hashes with associated seed values:
```
seed hash
0    4241098952
1    4241099017
2    4241099082
3    4241099147
4    4241099213
```

A  `MurmurHash3_32` algorithm would produce the following variations if given the seed values:
```
seed hash
0    1586663183
1    1128525090
2    3382554948
3    1761283998
4    1862001904
```

This PR passes the seed value through to the internal algorithm. Although the results do not exactly match the standard `MurmurHash3_32` the variations are much improved:
```
seed hash
0    4241098952
1    3782960922
2    1742023551
3    120752660
4    221470638
```

This variation is important for some machine learning operations.

Some gtests have hardcoded the hash values and were updated to match the new results.

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #12875
  • Loading branch information
davidwendt authored Mar 13, 2023
1 parent f216c0b commit 7bc4a7e
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 43 deletions.
9 changes: 6 additions & 3 deletions cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1711,7 +1711,10 @@ class device_row_hasher {
{
auto it = thrust::make_transform_iterator(_table.begin(), [=](auto const& column) {
return cudf::type_dispatcher<dispatch_storage_type>(
column.type(), element_hasher_adapter<hash_function>{_check_nulls}, column, row_index);
column.type(),
element_hasher_adapter<hash_function>{_check_nulls, _seed},
column,
row_index);
});

// Hash each element and combine all the hash values together
Expand All @@ -1734,8 +1737,8 @@ class device_row_hasher {
static constexpr hash_value_type NON_NULL_HASH = 0;

public:
__device__ element_hasher_adapter(Nullate check_nulls) noexcept
: _element_hasher(check_nulls), _check_nulls(check_nulls)
__device__ element_hasher_adapter(Nullate check_nulls, uint32_t seed) noexcept
: _element_hasher(check_nulls, seed), _check_nulls(check_nulls)
{
}

Expand Down
80 changes: 40 additions & 40 deletions cpp/tests/hashing/hash_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -152,16 +152,16 @@ TEST_F(HashTest, BasicList)

auto const expect_seeded = ICW{1607594268u,
1607594268u,
3658958173u,
4162508905u,
3658958173u,
2286117305u,
3271180885u,
761198477u,
761198477u,
1340178469u,
3271180885u,
3271180885u};
1576790066u,
1203671017u,
1576790066u,
2107478077u,
1756855002u,
2228938758u,
2228938758u,
3491134126u,
1756855002u,
1756855002u};

auto const seeded_output = cudf::hash(input, cudf::hash_id::HASH_MURMUR3, 15);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity);
Expand Down Expand Up @@ -192,14 +192,14 @@ TEST_F(HashTest, NullableList)

auto const expect_seeded = ICW{2271820643u,
2271820643u,
4263297392u,
4263297392u,
3089720935u,
1865775808u,
1865775808u,
1038318696u,
1038318696u,
595138041u,
3027840870u,
3027840870u,
2271820578u,
3089720935u,
3089720935u,
595138041u,
595138041u,
2271820578u};

auto const seeded_output = cudf::hash(cudf::table_view({col}), cudf::hash_id::HASH_MURMUR3, 31);
Expand Down Expand Up @@ -256,14 +256,14 @@ TEST_F(HashTest, ListOfStruct)
3642097855u,
3642097855u,
3642110391u,
3624905718u,
608933631u,
1899376347u,
1899376347u,
2058877614u,
2058877614u,
4013395891u,
4013395891u};
3889855760u,
1494406307u,
103934081u,
103934081u,
3462063680u,
3462063680u,
1696730835u,
1696730835u};

auto const seeded_output =
cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_MURMUR3, 619);
Expand Down Expand Up @@ -303,19 +303,19 @@ TEST_F(HashTest, ListOfEmptyStruct)
cudf::UNKNOWN_NULL_COUNT,
std::move(list_validity_buffer));

auto expect = cudf::test::fixed_width_column_wrapper<uint32_t>{-2023148619,
-2023148619,
-2023148682,
-2023148682,
-340558283,
-340558283,
-340558283,
-1999301021,
-1999301021,
-1999301020,
-1999301020,
-340558244,
-340558244};
auto expect = cudf::test::fixed_width_column_wrapper<uint32_t>{2271818677u,
2271818677u,
2271818614u,
2271818614u,
3954409013u,
3954409013u,
3954409013u,
2295666275u,
2295666275u,
2295666276u,
2295666276u,
3954409052u,
3954409052u};

auto output = cudf::hash(cudf::table_view({*list_column}));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);
Expand Down Expand Up @@ -343,7 +343,7 @@ TEST_F(HashTest, EmptyDeepList)
std::move(list_validity_buffer));

auto expect = cudf::test::fixed_width_column_wrapper<uint32_t>{
-2023148619, -2023148619, -2023148682, -2023148682};
2271818677u, 2271818677u, 2271818614u, 2271818614u};

auto output = cudf::hash(cudf::table_view({*list_column}));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);
Expand Down

0 comments on commit 7bc4a7e

Please sign in to comment.