Implement DataFrame.hash_values, deprecate DataFrame.hash_columns. (#9458)

This PR implements `DataFrame.hash_values`, which will replace `DataFrame.hash_columns` (which is deprecated in this PR). This proposal was discussed offline with @vyasr and in the weekly cuDF Python dev meeting. This unifies the method name and signature for `Series.hash_values` and `DataFrame.hash_values`, enabling future internal refactoring by moving the method's implementation to the `Frame` class (though I'm waiting for the removal of `Series.hash_encode` before following up on this, so that it can be done in a single pass; see #9381 and #9457).

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Sheilah Kirui (https://github.com/skirui-source)
- Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: #9458
rapidsai · Oct 19, 2021 · 2144034 · 2144034
1 parent 6fa562f
commit 2144034
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 9 deletions.
diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst
@@ -241,6 +241,7 @@ Serialization / IO / conversion
    DataFrame.from_pandas
    DataFrame.from_records
    DataFrame.hash_columns
+   DataFrame.hash_values
    DataFrame.to_arrow
    DataFrame.to_dlpack
    DataFrame.to_parquet

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -4396,6 +4396,9 @@ def apply_chunks(
     def hash_columns(self, columns=None, method="murmur3"):
         """Hash the given *columns* and return a new device array
 
+        This method is deprecated. Replace ``df.hash_columns(columns, method)``
+        with ``df[columns].hash_values(method)``.
+
         Parameters
         ----------
         columns : sequence of str; optional
@@ -4411,14 +4414,54 @@ def hash_columns(self, columns=None, method="murmur3"):
         Series
             Hash values for each row.
         """
-        table_to_hash = (
-            self
-            if columns is None
-            else Frame(data={k: self._data[k] for k in columns})
+        warnings.warn(
+            "The `hash_columns` method will be removed in a future cuDF "
+            "release. Replace `df.hash_columns(columns, method)` with "
+            "`df[columns].hash_values(method)`.",
+            FutureWarning,
         )
+        if columns is None:
+            # Slice by [:] to keep all columns.
+            columns = slice(None, None, None)
+        return self[columns].hash_values(method=method)
+
+    def hash_values(self, method="murmur3"):
+        """Compute the hash of values in each row.
+
+        Parameters
+        ----------
+        method : {'murmur3', 'md5'}, default 'murmur3'
+            Hash function to use:
+            * murmur3: MurmurHash3 hash function.
+            * md5: MD5 hash function.
+
+        Returns
+        -------
+        Series
+            A Series with hash values.
 
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]})
+        >>> df
+             a     b
+        0   10  0.00
+        1  120  0.25
+        2   30  0.50
+        >>> df.hash_values(method="murmur3")
+        0    -330519225
+        1    -397962448
+        2   -1345834934
+        dtype: int32
+        >>> df.hash_values(method="md5")
+        0    57ce879751b5169c525907d5c563fae1
+        1    948d6221a7c4963d4be411bcead7e32b
+        2    fe061786ea286a515b772d91b0dfcd70
+        dtype: object
+        """
         return Series._from_data(
-            {None: table_to_hash._hash(method=method)}, index=self.index
+            {None: self._hash(method=method)}, index=self.index
         )
 
     def partition_by_hash(self, columns, nparts, keep_index=True):

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -3501,7 +3501,15 @@ def hash_values(self, method="murmur3"):
         2     30
         dtype: int64
         >>> series.hash_values(method="murmur3")
-        array([-1930516747,   422619251,  -941520876], dtype=int32)
+        0   -1930516747
+        1     422619251
+        2    -941520876
+        dtype: int32
+        >>> series.hash_values(method="md5")
+        0    7be4bbacbfdb05fb3044e36c22b41e8b
+        1    947ca8d2c5f0f27437f156cfbfab0969
+        2    d0580ef52d27c043c8e341fd5039b166
+        dtype: object
         """
         return Series._from_data(
             {None: self._hash(method=method)}, index=self.index

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -1110,17 +1110,41 @@ def test_dataframe_hash_columns(nrows, method):
     data[0] = data[-1]  # make first and last the same
     gdf["a"] = data
     gdf["b"] = gdf.a + 100
-    out = gdf.hash_columns(["a", "b"])
+    with pytest.warns(FutureWarning):
+        out = gdf.hash_columns(["a", "b"])
     assert isinstance(out, cudf.Series)
     assert len(out) == nrows
     assert out.dtype == np.int32
 
     # Check default
-    out_all = gdf.hash_columns()
+    with pytest.warns(FutureWarning):
+        out_all = gdf.hash_columns()
     assert_eq(out, out_all)
 
     # Check single column
-    out_one = gdf.hash_columns(["a"], method=method)
+    with pytest.warns(FutureWarning):
+        out_one = gdf.hash_columns(["a"], method=method)
+    # First matches last
+    assert out_one.iloc[0] == out_one.iloc[-1]
+    # Equivalent to the cudf.Series.hash_values()
+    assert_eq(gdf["a"].hash_values(method=method), out_one)
+
+
+@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
+@pytest.mark.parametrize("method", ["murmur3", "md5"])
+def test_dataframe_hash_values(nrows, method):
+    gdf = cudf.DataFrame()
+    data = np.asarray(range(nrows))
+    data[0] = data[-1]  # make first and last the same
+    gdf["a"] = data
+    gdf["b"] = gdf.a + 100
+    out = gdf.hash_values()
+    assert isinstance(out, cudf.Series)
+    assert len(out) == nrows
+    assert out.dtype == np.int32
+
+    # Check single column
+    out_one = gdf[["a"]].hash_values(method=method)
     # First matches last
     assert out_one.iloc[0] == out_one.iloc[-1]
     # Equivalent to the cudf.Series.hash_values()