Skip to content

Commit

Permalink
Implement DataFrame.hash_values, deprecate DataFrame.hash_columns. (#9458)
Browse files Browse the repository at this point in the history

This PR implements `DataFrame.hash_values`, which will replace `DataFrame.hash_columns` (which is deprecated in this PR). This proposal was discussed offline with @vyasr and in the weekly cuDF Python dev meeting.

This unifies the method name and signature for `Series.hash_values` and `DataFrame.hash_values`, enabling a future internal refactoring that moves the method's implementation to the `Frame` class. That follow-up is waiting on the removal of `Series.hash_encode` so that both changes can be done in a single pass (see #9381 and #9457).

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Sheilah Kirui (https://github.com/skirui-source)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: #9458
  • Loading branch information
bdice authored Oct 19, 2021
1 parent 6fa562f commit 2144034
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Serialization / IO / conversion
DataFrame.from_pandas
DataFrame.from_records
DataFrame.hash_columns
DataFrame.hash_values
DataFrame.to_arrow
DataFrame.to_dlpack
DataFrame.to_parquet
Expand Down
53 changes: 48 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4396,6 +4396,9 @@ def apply_chunks(
def hash_columns(self, columns=None, method="murmur3"):
"""Hash the given *columns* and return a new device array
This method is deprecated. Replace ``df.hash_columns(columns, method)``
with ``df[columns].hash_values(method)``.
Parameters
----------
columns : sequence of str; optional
Expand All @@ -4411,14 +4414,54 @@ def hash_columns(self, columns=None, method="murmur3"):
Series
Hash values for each row.
"""
table_to_hash = (
self
if columns is None
else Frame(data={k: self._data[k] for k in columns})
warnings.warn(
"The `hash_columns` method will be removed in a future cuDF "
"release. Replace `df.hash_columns(columns, method)` with "
"`df[columns].hash_values(method)`.",
FutureWarning,
)
if columns is None:
# Slice by [:] to keep all columns.
columns = slice(None, None, None)
return self[columns].hash_values(method=method)

def hash_values(self, method="murmur3"):
"""Compute the hash of values in each row.
Parameters
----------
method : {'murmur3', 'md5'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.
Returns
-------
Series
A Series with hash values.
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]})
>>> df
a b
0 10 0.00
1 120 0.25
2 30 0.50
>>> df.hash_values(method="murmur3")
0 -330519225
1 -397962448
2 -1345834934
dtype: int32
>>> df.hash_values(method="md5")
0 57ce879751b5169c525907d5c563fae1
1 948d6221a7c4963d4be411bcead7e32b
2 fe061786ea286a515b772d91b0dfcd70
dtype: object
"""
return Series._from_data(
{None: table_to_hash._hash(method=method)}, index=self.index
{None: self._hash(method=method)}, index=self.index
)

def partition_by_hash(self, columns, nparts, keep_index=True):
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3501,7 +3501,15 @@ def hash_values(self, method="murmur3"):
2 30
dtype: int64
>>> series.hash_values(method="murmur3")
array([-1930516747, 422619251, -941520876], dtype=int32)
0 -1930516747
1 422619251
2 -941520876
dtype: int32
>>> series.hash_values(method="md5")
0 7be4bbacbfdb05fb3044e36c22b41e8b
1 947ca8d2c5f0f27437f156cfbfab0969
2 d0580ef52d27c043c8e341fd5039b166
dtype: object
"""
return Series._from_data(
{None: self._hash(method=method)}, index=self.index
Expand Down
30 changes: 27 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,17 +1110,41 @@ def test_dataframe_hash_columns(nrows, method):
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_columns(["a", "b"])
with pytest.warns(FutureWarning):
out = gdf.hash_columns(["a", "b"])
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.int32

# Check default
out_all = gdf.hash_columns()
with pytest.warns(FutureWarning):
out_all = gdf.hash_columns()
assert_eq(out, out_all)

# Check single column
out_one = gdf.hash_columns(["a"], method=method)
with pytest.warns(FutureWarning):
out_one = gdf.hash_columns(["a"], method=method)
# First matches last
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
assert_eq(gdf["a"].hash_values(method=method), out_one)


@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
@pytest.mark.parametrize("method", ["murmur3", "md5"])
def test_dataframe_hash_values(nrows, method):
gdf = cudf.DataFrame()
data = np.asarray(range(nrows))
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_values()
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.int32

# Check single column
out_one = gdf[["a"]].hash_values(method=method)
# First matches last
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
Expand Down

0 comments on commit 2144034

Please sign in to comment.