From 2144034895b2282bd11e815c90f4668f1cba1756 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Oct 2021 12:54:46 -0700 Subject: [PATCH] Implement DataFrame.hash_values, deprecate DataFrame.hash_columns. (#9458) This PR implements `DataFrame.hash_values`, which will replace `DataFrame.hash_columns` (which is deprecated in this PR). This proposal was discussed offline with @vyasr and in the weekly cuDF Python dev meeting. This unifies the method name and signature for `Series.hash_values` and `DataFrame.hash_values`, enabling future internal refactoring by moving the method's implementation to the `Frame` class (though I'm waiting for the removal of `Series.hash_encode` to follow up on this so it can be done in a single pass, see #9381 and #9457). Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Sheilah Kirui (https://github.com/skirui-source) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/9458 --- docs/cudf/source/api_docs/dataframe.rst | 1 + python/cudf/cudf/core/dataframe.py | 53 +++++++++++++++++++++--- python/cudf/cudf/core/series.py | 10 ++++- python/cudf/cudf/tests/test_dataframe.py | 30 ++++++++++++-- 4 files changed, 85 insertions(+), 9 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 12ff1f13bc4..e9a79701d5b 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -241,6 +241,7 @@ Serialization / IO / conversion DataFrame.from_pandas DataFrame.from_records DataFrame.hash_columns + DataFrame.hash_values DataFrame.to_arrow DataFrame.to_dlpack DataFrame.to_parquet diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2e5e52d6eba..5d3b05e3f85 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4396,6 +4396,9 @@ def apply_chunks( def hash_columns(self, columns=None, method="murmur3"): """Hash the given *columns* and return a new device array + This method is deprecated. Replace ``df.hash_columns(columns, method)`` + with ``df[columns].hash_values(method)``. + Parameters ---------- columns : sequence of str; optional @@ -4411,14 +4414,54 @@ def hash_columns(self, columns=None, method="murmur3"): Series Hash values for each row. """ - table_to_hash = ( - self - if columns is None - else Frame(data={k: self._data[k] for k in columns}) + warnings.warn( + "The `hash_columns` method will be removed in a future cuDF " + "release. Replace `df.hash_columns(columns, method)` with " + "`df[columns].hash_values(method)`.", + FutureWarning, ) + if columns is None: + # Slice by [:] to keep all columns. + columns = slice(None, None, None) + return self[columns].hash_values(method=method) + + def hash_values(self, method="murmur3"): + """Compute the hash of values in each row. + + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + A Series with hash values. + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) + >>> df + a b + 0 10 0.00 + 1 120 0.25 + 2 30 0.50 + >>> df.hash_values(method="murmur3") + 0 -330519225 + 1 -397962448 + 2 -1345834934 + dtype: int32 + >>> df.hash_values(method="md5") + 0 57ce879751b5169c525907d5c563fae1 + 1 948d6221a7c4963d4be411bcead7e32b + 2 fe061786ea286a515b772d91b0dfcd70 + dtype: object + """ return Series._from_data( - {None: table_to_hash._hash(method=method)}, index=self.index + {None: self._hash(method=method)}, index=self.index ) def partition_by_hash(self, columns, nparts, keep_index=True): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 87386b2f184..bcc97ae82ce 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3501,7 +3501,15 @@ def hash_values(self, method="murmur3"): 2 30 dtype: int64 >>> series.hash_values(method="murmur3") - array([-1930516747, 422619251, -941520876], dtype=int32) + 0 -1930516747 + 1 422619251 + 2 -941520876 + dtype: int32 + >>> series.hash_values(method="md5") + 0 7be4bbacbfdb05fb3044e36c22b41e8b + 1 947ca8d2c5f0f27437f156cfbfab0969 + 2 d0580ef52d27c043c8e341fd5039b166 + dtype: object """ return Series._from_data( {None: self._hash(method=method)}, index=self.index diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 33068e6f722..c1eade0fcdc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1110,17 +1110,41 @@ def test_dataframe_hash_columns(nrows, method): data[0] = data[-1] # make first and last the same gdf["a"] = data gdf["b"] = gdf.a + 100 - out = gdf.hash_columns(["a", "b"]) + with pytest.warns(FutureWarning): + out = gdf.hash_columns(["a", "b"]) assert isinstance(out, cudf.Series) assert len(out) == nrows assert out.dtype == np.int32 # Check default - out_all = gdf.hash_columns() + with pytest.warns(FutureWarning): + out_all = gdf.hash_columns() assert_eq(out, out_all) # Check single column - out_one = gdf.hash_columns(["a"], method=method) + with pytest.warns(FutureWarning): + out_one = gdf.hash_columns(["a"], method=method) + # First matches last + assert out_one.iloc[0] == out_one.iloc[-1] + # Equivalent to the cudf.Series.hash_values() + assert_eq(gdf["a"].hash_values(method=method), out_one) + + +@pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) +@pytest.mark.parametrize("method", ["murmur3", "md5"]) +def test_dataframe_hash_values(nrows, method): + gdf = cudf.DataFrame() + data = np.asarray(range(nrows)) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + out = gdf.hash_values() + assert isinstance(out, cudf.Series) + assert len(out) == nrows + assert out.dtype == np.int32 + + # Check single column + out_one = gdf[["a"]].hash_values(method=method) # First matches last assert out_one.iloc[0] == out_one.iloc[-1] # Equivalent to the cudf.Series.hash_values()