Refactor IndexedFrame.hash_values to use cudf::hashing functions, add xxhash64 to cudf Python. (#14538)

This PR refactors the Python code for `IndexedFrame.hash_values` to use the newer named C++ functions from `cudf::hashing::*`. I also added bindings for xxhash64 and updated some tests. Needed for #14391.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: #14538
rapidsai · Dec 12, 2023 · 3f4ca49 · 3f4ca49
1 parent 7a66e00
commit 3f4ca49
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 55 deletions.
diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd
@@ -1,6 +1,6 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
@@ -9,16 +9,18 @@ from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 
 
-cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
+cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
 
-    ctypedef enum hash_id "cudf::hash_id":
-        HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY"
-        HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3"
-        HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3"
-        HASH_MD5 "cudf::hash_id::HASH_MD5"
-
-    cdef unique_ptr[column] hash "cudf::hash" (
+    cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
         const table_view& input,
-        const hash_id hash_function,
         const uint32_t seed
     ) except +
+
+    cdef unique_ptr[column] md5 "cudf::hashing::md5" (
+        const table_view& input
+    ) except +
+
+    cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
+        const table_view& input,
+        const uint64_t seed
+    ) except +
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -10,7 +10,7 @@ from libcpp.vector cimport vector
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
+from cudf._lib.cpp.hash cimport md5, murmurhash3_x86_32, xxhash_64
 from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -44,20 +44,15 @@ def hash_partition(list source_columns, object columns_to_hash,
 def hash(list source_columns, str method, int seed=0):
     cdef table_view c_source_view = table_view_from_columns(source_columns)
     cdef unique_ptr[column] c_result
-    cdef cpp_hash_id c_hash_function
     if method == "murmur3":
-        c_hash_function = cpp_hash_id.HASH_MURMUR3
+        with nogil:
+            c_result = move(murmurhash3_x86_32(c_source_view, seed))
     elif method == "md5":
-        c_hash_function = cpp_hash_id.HASH_MD5
+        with nogil:
+            c_result = move(md5(c_source_view))
+    elif method == "xxhash64":
+        with nogil:
+            c_result = move(xxhash_64(c_source_view, seed))
     else:
         raise ValueError(f"Unsupported hash function: {method}")
-    with nogil:
-        c_result = move(
-            cpp_hash(
-                c_source_view,
-                c_hash_function,
-                seed
-            )
-        )
-
     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -1732,16 +1732,17 @@ def hash_values(self, method="murmur3", seed=None):
 
         Parameters
         ----------
-        method : {'murmur3', 'md5'}, default 'murmur3'
+        method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
             Hash function to use:
-            * murmur3: MurmurHash3 hash function.
-            * md5: MD5 hash function.
+
+            * murmur3: MurmurHash3 hash function
+            * md5: MD5 hash function
+            * xxhash64: xxHash64 hash function
 
         seed : int, optional
-            Seed value to use for the hash function.
-            Note - This only has effect for the following supported
-            hash functions:
-            * murmur3: MurmurHash3 hash function.
+            Seed value to use for the hash function. This parameter is only
+            supported for 'murmur3' and 'xxhash64'.
+
 
         Returns
         -------
@@ -1795,14 +1796,13 @@ def hash_values(self, method="murmur3", seed=None):
         2    fe061786ea286a515b772d91b0dfcd70
         dtype: object
         """
-        seed_hash_methods = {"murmur3"}
+        seed_hash_methods = {"murmur3", "xxhash64"}
         if seed is None:
             seed = 0
         elif method not in seed_hash_methods:
             warnings.warn(
-                "Provided seed value has no effect for hash method"
-                f" `{method}`. Refer to the docstring for information"
-                " on hash methods that support the `seed` param"
+                "Provided seed value has no effect for the hash method "
+                f"`{method}`. Only {seed_hash_methods} support seeds."
             )
         # Note that both Series and DataFrame return Series objects from this
         # calculation, necessitating the unfortunate circular reference to the

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 import array as arr
+import contextlib
 import datetime
 import decimal
 import io
@@ -1389,43 +1390,46 @@ def test_assign_callable(mapping):
 
 
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
-@pytest.mark.parametrize("method", ["murmur3", "md5"])
+@pytest.mark.parametrize("method", ["murmur3", "md5", "xxhash64"])
 @pytest.mark.parametrize("seed", [None, 42])
 def test_dataframe_hash_values(nrows, method, seed):
+    warning_expected = seed is not None and method not in {
+        "murmur3",
+        "xxhash64",
+    }
+    potential_warning = (
+        pytest.warns(UserWarning, match="Provided seed value has no effect*")
+        if warning_expected
+        else contextlib.nullcontext()
+    )
+
     gdf = cudf.DataFrame()
     data = np.arange(nrows)
     data[0] = data[-1]  # make first and last the same
     gdf["a"] = data
     gdf["b"] = gdf.a + 100
-    out = gdf.hash_values()
+    with potential_warning:
+        out = gdf.hash_values(method=method, seed=seed)
     assert isinstance(out, cudf.Series)
     assert len(out) == nrows
-    assert out.dtype == np.uint32
+    expected_dtypes = {
+        "murmur3": np.uint32,
+        "md5": object,
+        "xxhash64": np.uint64,
+    }
+    assert out.dtype == expected_dtypes[method]
 
-    warning_expected = (
-        True if seed is not None and method not in {"murmur3"} else False
-    )
     # Check single column
-    if warning_expected:
-        with pytest.warns(
-            UserWarning, match="Provided seed value has no effect*"
-        ):
-            out_one = gdf[["a"]].hash_values(method=method, seed=seed)
-    else:
+    with potential_warning:
         out_one = gdf[["a"]].hash_values(method=method, seed=seed)
     # First matches last
     assert out_one.iloc[0] == out_one.iloc[-1]
     # Equivalent to the cudf.Series.hash_values()
-    if warning_expected:
-        with pytest.warns(
-            UserWarning, match="Provided seed value has no effect*"
-        ):
-            assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
-    else:
+    with potential_warning:
         assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
 
 
-@pytest.mark.parametrize("method", ["murmur3"])
+@pytest.mark.parametrize("method", ["murmur3", "xxhash64"])
 def test_dataframe_hash_values_seed(method):
     gdf = cudf.DataFrame()
     data = np.arange(10)
@@ -1439,6 +1443,52 @@ def test_dataframe_hash_values_seed(method):
     assert_neq(out_one, out_two)
 
 
+def test_dataframe_hash_values_xxhash64():
+    # xxhash64 has no built-in implementation in Python and we don't want to
+    # add a testing dependency, so we use regression tests against known good
+    # values.
+    gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]})
+    gdf["b"] = -gdf["a"]
+    out_a = gdf["a"].hash_values(method="xxhash64", seed=0)
+    expected_a = cudf.Series(
+        [
+            3803688792395291579,
+            10706502109028787093,
+            9835943264235290955,
+            18031741628920313605,
+            18446744073709551615,
+        ],
+        dtype=np.uint64,
+    )
+    assert_eq(out_a, expected_a)
+
+    out_b = gdf["b"].hash_values(method="xxhash64", seed=42)
+    expected_b = cudf.Series(
+        [
+            9826995235083043316,
+            10150515573749944095,
+            5005707091092326006,
+            5326262080505358431,
+            18446744073709551615,
+        ],
+        dtype=np.uint64,
+    )
+    assert_eq(out_b, expected_b)
+
+    out_df = gdf.hash_values(method="xxhash64", seed=0)
+    expected_df = cudf.Series(
+        [
+            10208049663714815266,
+            4949201786888768834,
+            18122173653994477335,
+            11133539368563441730,
+            18446744073709551615,
+        ],
+        dtype=np.uint64,
+    )
+    assert_eq(out_df, expected_df)
+
+
 @pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
 @pytest.mark.parametrize("nparts", [1, 2, 8, 13])
 @pytest.mark.parametrize("nkeys", [1, 2])