From 174c8285d117c498dfb6dfdefe36ac9b12a9f93d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 4 Oct 2021 11:51:42 -0700
Subject: [PATCH 01/12] Add MD5 API to Python.

---
 python/cudf/cudf/core/dataframe.py | 14 +++++++++++++-
 python/cudf/cudf/core/series.py    | 17 ++++++++++++-----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6d41d90ab47..15f48d7499f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5003,7 +5003,7 @@ def apply_chunks(
             tpb=tpb,
         )
 
-    def hash_columns(self, columns=None):
+    def hash_columns(self, columns=None, method="murmur3"):
         """Hash the given *columns* and return a new device array
 
         Parameters
@@ -5011,7 +5011,19 @@ def hash_columns(self, columns=None):
         columns : sequence of str; optional
             Sequence of column names. If columns is *None* (unspecified),
             all columns in the frame are used.
+        method : {'murmur3', 'md5'}, default 'murmur3'
+            Hash function to use:
+            * murmur3: MurmurHash3 hash function.
+            * md5: MD5 hash function.
+
+        Returns
+        -------
+        Series
+            Hash values for each row.
         """
+        if method not in {"murmur3", "md5"}:
+            raise ValueError(f"Unsupported hash function: {method}")
+
         if columns is None:
             table_to_hash = self
         else:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8188290c392..5b0c1e84552 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4095,13 +4095,20 @@ def floor(self):
         """
         return self._unaryop("floor")
 
-    def hash_values(self):
+    def hash_values(self, method="murmur3"):
         """Compute the hash of values in this column.
 
+        Parameters
+        ----------
+        method : {'murmur3', 'md5'}, default 'murmur3'
+            Hash function to use:
+            * murmur3: MurmurHash3 hash function.
+            * md5: MD5 hash function.
+
         Returns
         -------
-        cupy array
-            A cupy array with hash values.
+        Series
+            A Series with hash values.
 
         Examples
         --------
@@ -4112,10 +4119,10 @@ def hash_values(self):
         1    120
         2     30
         dtype: int64
-        >>> series.hash_values()
+        >>> series.hash_values(method="murmur3")
         array([-1930516747,   422619251,  -941520876], dtype=int32)
         """
-        return Series(self._hash()).values
+        return Series(self._hash())
 
     def hash_encode(self, stop, use_name=False):
         """Encode column values as ints in [0, stop) using hash function.

From 873d8e4a4ef60638a1d9f9aaf04c1dc5b952ad7d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 4 Oct 2021 11:58:33 -0700
Subject: [PATCH 02/12] Return Series.

---
 python/cudf/cudf/core/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 15f48d7499f..1869f84114c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5030,7 +5030,7 @@ def hash_columns(self, columns=None, method="murmur3"):
             cols = [self[k]._column for k in columns]
             table_to_hash = Frame(data=dict(zip(columns, cols)))
 
-        return Series(table_to_hash._hash()).values
+        return Series(table_to_hash._hash())
 
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.

From 6525b7e73078aacf57b3fec83b7053afa468be0e Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 10:19:17 -0700
Subject: [PATCH 03/12] Add hash method parameter.

---
 python/cudf/cudf/_lib/hash.pyx  | 11 ++++++++---
 python/cudf/cudf/core/frame.py  |  4 ++--
 python/cudf/cudf/core/series.py |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index ae0116e635b..b50dd0fa804 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -54,17 +54,22 @@ def hash_partition(source_table, object columns_to_hash,
     )
 
 
-def hash(source_table, object initial_hash_values=None, int seed=0):
+def hash(source_table, str method, object initial_hash_values=None,
+         int seed=0):
     cdef vector[uint32_t] c_initial_hash = initial_hash_values or []
     cdef table_view c_source_view = table_view_from_table(
         source_table, ignore_index=True)
-
     cdef unique_ptr[column] c_result
+    cdef libcudf_types.hash_id c_hash_id
+    if method == "murmur3":
+        c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
+    else:
+        raise ValueError(f"Unsupported hash function: {method}")
     with nogil:
         c_result = move(
             cpp_hash(
                 c_source_view,
-                libcudf_types.hash_id.HASH_MURMUR3,
+                c_hash_function,
                 c_initial_hash,
                 seed
             )
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 12fe7f313eb..463dc16b840 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False):
             result._index.names = self._index.names
         return result
 
-    def _hash(self, initial_hash_values=None):
-        return libcudf.hash.hash(self, initial_hash_values)
+    def _hash(self, method, initial_hash_values=None):
+        return libcudf.hash.hash(self, method, initial_hash_values)
 
     def _hash_partition(
         self, columns_to_hash, num_partitions, keep_index=True
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 5b0c1e84552..951c7756537 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4122,7 +4122,7 @@ def hash_values(self, method="murmur3"):
         >>> series.hash_values(method="murmur3")
         array([-1930516747,   422619251,  -941520876], dtype=int32)
         """
-        return Series(self._hash())
+        return Series(self._hash(method=method))
 
     def hash_encode(self, stop, use_name=False):
         """Encode column values as ints in [0, stop) using hash function.

From 10354821eca639b067aab1af86c4d4b2b54de9f2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 11:12:54 -0700
Subject: [PATCH 04/12] Map method="md5" to HASH_MD5 in Cython hash().

---
 python/cudf/cudf/_lib/hash.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index b50dd0fa804..6892831620f 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -63,6 +63,8 @@ def hash(source_table, str method, object initial_hash_values=None,
     cdef libcudf_types.hash_id c_hash_id
     if method == "murmur3":
         c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
+    elif method == "md5":
+        c_hash_function = libcudf_types.hash_id.HASH_MD5
     else:
         raise ValueError(f"Unsupported hash function: {method}")
     with nogil:

From 6a70c54332fca180b17d3cb5503696e39ffd3fdd Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 11:56:42 -0700
Subject: [PATCH 05/12] Update hash API internals.

---
 python/cudf/cudf/_lib/hash.pyx           |  4 ++--
 python/cudf/cudf/core/dataframe.py       |  5 +----
 python/cudf/cudf/core/frame.py           |  4 ++--
 python/cudf/cudf/core/series.py          |  4 +++-
 python/cudf/cudf/tests/test_dataframe.py | 13 +++++++------
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 6892831620f..dd25e01a9e7 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -54,9 +54,9 @@ def hash_partition(source_table, object columns_to_hash,
     )
 
 
-def hash(source_table, str method, object initial_hash_values=None,
+def hash(source_table, str method, object initial_hash=None,
          int seed=0):
-    cdef vector[uint32_t] c_initial_hash = initial_hash_values or []
+    cdef vector[uint32_t] c_initial_hash = initial_hash or []
     cdef table_view c_source_view = table_view_from_table(
         source_table, ignore_index=True)
     cdef unique_ptr[column] c_result
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1869f84114c..9fb29309672 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5021,16 +5021,13 @@ def hash_columns(self, columns=None, method="murmur3"):
         Series
             Hash values for each row.
         """
-        if method not in {"murmur3", "md5"}:
-            raise ValueError(f"Unsupported hash function: {method}")
-
         if columns is None:
             table_to_hash = self
         else:
             cols = [self[k]._column for k in columns]
             table_to_hash = Frame(data=dict(zip(columns, cols)))
 
-        return Series(table_to_hash._hash())
+        return Series(table_to_hash._hash(method=method))
 
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 463dc16b840..581ae6a5204 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False):
             result._index.names = self._index.names
         return result
 
-    def _hash(self, method, initial_hash_values=None):
-        return libcudf.hash.hash(self, method, initial_hash_values)
+    def _hash(self, method, initial_hash=None):
+        return libcudf.hash.hash(self, method, initial_hash)
 
     def _hash_partition(
         self, columns_to_hash, num_partitions, keep_index=True
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 951c7756537..554f7535c30 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4165,7 +4165,9 @@ def hash_encode(self, stop, use_name=False):
             raise ValueError("stop must be a positive integer.")
 
         initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None
-        hashed_values = Series(self._hash(initial_hash))
+        hashed_values = Series(
+            self._hash(method="murmur3", initial_hash=initial_hash)
+        )
 
         if hashed_values.has_nulls:
             raise ValueError("Column must have no nulls.")
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 5a839507182..b42bfa6d2e2 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1103,27 +1103,28 @@ def test_assign():
 
 
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
-def test_dataframe_hash_columns(nrows):
+@pytest.mark.parametrize("method", ["murmur3", "md5"])
+def test_dataframe_hash_columns(nrows, method):
     gdf = cudf.DataFrame()
     data = np.asarray(range(nrows))
     data[0] = data[-1]  # make first and last the same
     gdf["a"] = data
     gdf["b"] = gdf.a + 100
     out = gdf.hash_columns(["a", "b"])
-    assert isinstance(out, cupy.ndarray)
+    # assert isinstance(out, cupy.ndarray)
     assert len(out) == nrows
     assert out.dtype == np.int32
 
     # Check default
     out_all = gdf.hash_columns()
-    np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all))
+    assert_eq(out, out_all)
 
     # Check single column
-    out_one = cupy.asnumpy(gdf.hash_columns(["a"]))
+    out_one = gdf.hash_columns(["a"], method=method)
     # First matches last
-    assert out_one[0] == out_one[-1]
+    assert out_one.iloc[0] == out_one.iloc[-1]
     # Equivalent to the cudf.Series.hash_values()
-    np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one)
+    assert_eq(gdf["a"].hash_values(method=method), out_one)
 
 
 @pytest.mark.parametrize("nrows", [3, 10, 100, 1000])

From eda11339ca1b63baefa4ee171b3ad4513ed250b2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 12:05:12 -0700
Subject: [PATCH 06/12] Fix cdef variable name (c_hash_id -> c_hash_function).

---
 python/cudf/cudf/_lib/hash.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index dd25e01a9e7..f15052f33e9 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -60,7 +60,7 @@ def hash(source_table, str method, object initial_hash=None,
     cdef table_view c_source_view = table_view_from_table(
         source_table, ignore_index=True)
     cdef unique_ptr[column] c_result
-    cdef libcudf_types.hash_id c_hash_id
+    cdef libcudf_types.hash_id c_hash_function
     if method == "murmur3":
         c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
     elif method == "md5":

From 2c60587884c35338863bea673fb25ee693f6ae7d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 12:09:29 -0700
Subject: [PATCH 07/12] Check hash results are a Series.

---
 python/cudf/cudf/tests/test_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b42bfa6d2e2..5a9fcb615e5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1111,7 +1111,7 @@ def test_dataframe_hash_columns(nrows, method):
     gdf["a"] = data
     gdf["b"] = gdf.a + 100
     out = gdf.hash_columns(["a", "b"])
-    # assert isinstance(out, cupy.ndarray)
+    assert isinstance(out, cudf.Series)
     assert len(out) == nrows
     assert out.dtype == np.int32
 

From 8962cf62278c4f7692bec93ab73b6a0f61db29c8 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 6 Oct 2021 12:19:41 -0700
Subject: [PATCH 08/12] Join hash() signature onto a single line.

---
 python/cudf/cudf/_lib/hash.pyx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index f15052f33e9..9b34a049cac 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -54,8 +54,7 @@ def hash_partition(source_table, object columns_to_hash,
     )
 
 
-def hash(source_table, str method, object initial_hash=None,
-         int seed=0):
+def hash(source_table, str method, object initial_hash=None, int seed=0):
     cdef vector[uint32_t] c_initial_hash = initial_hash or []
     cdef table_view c_source_view = table_view_from_table(
         source_table, ignore_index=True)

From bce219be88fed9afcbc82e83b51ba0c3323c4808 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 7 Oct 2021 07:40:05 -0700
Subject: [PATCH 09/12] Use Series._from_data.

---
 python/cudf/cudf/core/dataframe.py | 4 +++-
 python/cudf/cudf/core/series.py    | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9fb29309672..7c706fda177 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5027,7 +5027,9 @@ def hash_columns(self, columns=None, method="murmur3"):
             cols = [self[k]._column for k in columns]
             table_to_hash = Frame(data=dict(zip(columns, cols)))
 
-        return Series(table_to_hash._hash(method=method))
+        return Series._from_data(
+            {None: table_to_hash._hash(method=method)}, index=self.index
+        )
 
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 554f7535c30..f4352147f45 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4122,7 +4122,9 @@ def hash_values(self, method="murmur3"):
         >>> series.hash_values(method="murmur3")
         array([-1930516747,   422619251,  -941520876], dtype=int32)
         """
-        return Series(self._hash(method=method))
+        return Series._from_data(
+            {None: self._hash(method=method)}, index=self.index
+        )
 
     def hash_encode(self, stop, use_name=False):
         """Encode column values as ints in [0, stop) using hash function.

From c012707597fca1ff07b619daf900cd1078414029 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 7 Oct 2021 07:43:41 -0700
Subject: [PATCH 10/12] Use self._data, simplify expression.

---
 python/cudf/cudf/core/dataframe.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7c706fda177..0b4f77d4f93 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5021,11 +5021,11 @@ def hash_columns(self, columns=None, method="murmur3"):
         Series
             Hash values for each row.
         """
-        if columns is None:
-            table_to_hash = self
-        else:
-            cols = [self[k]._column for k in columns]
-            table_to_hash = Frame(data=dict(zip(columns, cols)))
+        table_to_hash = (
+            self
+            if columns is None
+            else Frame(data={k: self._data[k] for k in columns})
+        )
 
         return Series._from_data(
             {None: table_to_hash._hash(method=method)}, index=self.index

From 0bd1a562214cbc7c9deb8e6112c0e7b379e7d3e7 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 7 Oct 2021 08:06:17 -0700
Subject: [PATCH 11/12] Use Series._from_data in hash_encode.

---
 python/cudf/cudf/core/series.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f4352147f45..1cd45706a1e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4167,15 +4167,19 @@ def hash_encode(self, stop, use_name=False):
             raise ValueError("stop must be a positive integer.")
 
         initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None
-        hashed_values = Series(
-            self._hash(method="murmur3", initial_hash=initial_hash)
+        hashed_values = Series._from_data(
+            {
+                self.name: self._hash(
+                    method="murmur3", initial_hash=initial_hash
+                )
+            },
+            self.index,
         )
 
         if hashed_values.has_nulls:
             raise ValueError("Column must have no nulls.")
 
-        mod_vals = hashed_values % stop
-        return Series(mod_vals._column, index=self.index, name=self.name)
+        return hashed_values % stop
 
     def quantile(
         self, q=0.5, interpolation="linear", exact=True, quant_index=True

From 29de972590c5561ae6a16c76e93ce3e3ddecc9a3 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 7 Oct 2021 19:16:44 -0700
Subject: [PATCH 12/12] Add tests for MD5 Python API.

---
 python/cudf/cudf/tests/test_series.py | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index ca179703864..cd18177fcf0 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1272,3 +1272,47 @@ def test_series_sort_index(
         assert_eq(ps, gs, check_index_type=True)
     else:
         assert_eq(expected, got, check_index_type=True)
+
+
+@pytest.mark.parametrize(
+    "method,validation_data",
+    [
+        (
+            "md5",
+            [
+                "d41d8cd98f00b204e9800998ecf8427e",
+                "cfcd208495d565ef66e7dff9f98764da",
+                "3d3aaae21d57b101227f0384f644abe0",
+                "3e76c7023d771ad1c1520c27ab3d4874",
+                "f8d805e33ec3ade1a6ea251ac1c118e7",
+                "c30515f66a5aec7af7666abf33600c92",
+                "c61a4185135eda043f35e92c3505e180",
+                "52da74c75cb6575d25be29e66bd0adde",
+                "5152ac13bdd09110d9ee9c169a3d9237",
+                "f1d3ff8443297732862df21dc4e57262",
+            ],
+        )
+    ],
+)
+def test_series_hash_values(method, validation_data):
+    inputs = cudf.Series(
+        [
+            "",
+            "0",
+            "A 56 character string to test message padding algorithm.",
+            "A 63 character string to test message padding algorithm, again.",
+            "A 64 character string to test message padding algorithm, again!!",
+            (
+                "A very long (greater than 128 bytes/char string) to execute "
+                "a multi hash-step data point in the hash function being "
+                "tested. This string needed to be longer."
+            ),
+            "All work and no play makes Jack a dull boy",
+            "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+            "\x00\x00\x00\x10\x00\x00\x00\x00",
+            "\x00\x00\x00\x00",
+        ]
+    )
+    validation_results = cudf.Series(validation_data)
+    hash_values = inputs.hash_values(method=method)
+    assert_eq(hash_values, validation_results)