Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose seed argument to hash_values #12795

Merged
merged 11 commits into from
Feb 24, 2023
24 changes: 22 additions & 2 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@ def memory_usage(self, index=True, deep=False):
"""
raise NotImplementedError

def hash_values(self, method="murmur3"):
def hash_values(self, method="murmur3", seed=None):
"""Compute the hash of values in this column.

Parameters
Expand All @@ -1639,6 +1639,12 @@ def hash_values(self, method="murmur3"):
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

seed : int, optional
Seed value to use for the hash function.
Note - This only has effect for the following supported
hash functions:
* murmur3: MurmurHash3 hash function.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the seed has no effect, I think we should emit a warning. We have some flexibility here, since this method has no equivalent pandas API.


Returns
-------
Series
Expand All @@ -1665,6 +1671,11 @@ def hash_values(self, method="murmur3"):
1 947ca8d2c5f0f27437f156cfbfab0969
2 d0580ef52d27c043c8e341fd5039b166
dtype: object
>>> series.hash_values(method="murmur3", seed=42)
0 2364453205
1 422621911
2 3353449140
dtype: uint32

**DataFrame**

Expand All @@ -1686,11 +1697,20 @@ def hash_values(self, method="murmur3"):
2 fe061786ea286a515b772d91b0dfcd70
dtype: object
"""
seed_hash_methods = {"murmur3"}
if seed is None:
seed = 0
elif method not in seed_hash_methods:
warnings.warn(
"Provided seed value has no effect for hash method"
f" `{method}`. Refer to the docstring for information"
" on hash methods that support the `seed` param"
)
# Note that both Series and DataFrame return Series objects from this
# calculation, necessitating the unfortunate circular reference to the
# child class here.
return cudf.Series._from_data(
{None: libcudf.hash.hash([*self._columns], method)},
{None: libcudf.hash.hash([*self._columns], method, seed)},
index=self.index,
)

Expand Down
39 changes: 35 additions & 4 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
NUMERIC_TYPES,
assert_eq,
assert_exceptions_equal,
assert_neq,
does_not_raise,
expect_warning_if,
gen_rand,
Expand Down Expand Up @@ -1323,9 +1324,10 @@ def test_assign():

@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
@pytest.mark.parametrize("method", ["murmur3", "md5"])
def test_dataframe_hash_values(nrows, method):
@pytest.mark.parametrize("seed", [None, 42])
def test_dataframe_hash_values(nrows, method, seed):
gdf = cudf.DataFrame()
data = np.asarray(range(nrows))
data = np.arange(nrows)
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
Expand All @@ -1334,12 +1336,41 @@ def test_dataframe_hash_values(nrows, method):
assert len(out) == nrows
assert out.dtype == np.uint32

warning_expected = (
True if seed is not None and method not in {"murmur3"} else False
)
# Check single column
out_one = gdf[["a"]].hash_values(method=method)
if warning_expected:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another alternative is to move the warning check into its own pytest test and not use the seed at all in this one.

with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
else:
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
# First matches last
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
assert_eq(gdf["a"].hash_values(method=method), out_one)
if warning_expected:
with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
else:
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)


@pytest.mark.parametrize("method", ["murmur3"])
def test_dataframe_hash_values_seed(method):
    """Check that changing the seed changes the hashes but not row equality.

    The first and last rows of the frame are made identical, so for any
    given seed they must hash to the same value, while the hashes produced
    with seed 0 and seed 1 must differ from each other.
    """
    values = np.arange(10)
    # Copy the last value into the first slot so that the first and last
    # rows of the frame are identical.
    values[0] = values[-1]

    frame = cudf.DataFrame()
    frame["a"] = values
    frame["b"] = frame.a + 100

    hashed_seed0, hashed_seed1 = (
        frame.hash_values(method=method, seed=seed_value)
        for seed_value in (0, 1)
    )

    # Identical rows must collide regardless of which seed was used.
    for hashed in (hashed_seed0, hashed_seed1):
        assert hashed.iloc[0] == hashed.iloc[-1]

    # Different seeds must yield different hash output.
    assert_neq(hashed_seed0, hashed_seed1)


@pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
Expand Down