Skip to content

Commit

Permalink
Refactor IndexedFrame.hash_values to use cudf::hashing functions, add xxhash64 to cudf Python. (#14538)
Browse files Browse the repository at this point in the history

This PR refactors the Python code for `IndexedFrame.hash_values` to use the newer named C++ functions from `cudf::hashing::*`. I also added bindings for xxhash64 and updated some tests.

Needed for #14391.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14538
  • Loading branch information
bdice authored Dec 12, 2023
1 parent 7a66e00 commit 3f4ca49
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 55 deletions.
24 changes: 13 additions & 11 deletions python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

Expand All @@ -9,16 +9,18 @@ from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view


# Cython declarations for the hashing functions found in "cudf/hashing.hpp".
# Every function declared here takes a `table_view` and returns a
# `unique_ptr[column]`; the trailing `except +` asks Cython to translate a
# thrown C++ exception into a Python exception.
# NOTE(review): this listing appears to interleave the removed and the added
# lines of the diff (two `cdef extern` headers, the `hash_id` enum and the
# `hash_function` parameter sit next to the per-algorithm declarations) --
# confirm against the actual file before relying on it.
cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:

ctypedef enum hash_id "cudf::hash_id":
HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY"
HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3"
HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3"
HASH_MD5 "cudf::hash_id::HASH_MD5"

cdef unique_ptr[column] hash "cudf::hash" (
# murmurhash3_x86_32: seeded with a 32-bit unsigned value.
cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
const table_view& input,
const hash_id hash_function,
const uint32_t seed
) except +

# md5: declared without a seed parameter.
cdef unique_ptr[column] md5 "cudf::hashing::md5" (
const table_view& input
) except +

# xxhash_64: seeded with a 64-bit unsigned value.
cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
const table_view& input,
const uint64_t seed
) except +
23 changes: 9 additions & 14 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

Expand All @@ -10,7 +10,7 @@ from libcpp.vector cimport vector
cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
from cudf._lib.cpp.hash cimport md5, murmurhash3_x86_32, xxhash_64
from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
Expand Down Expand Up @@ -44,20 +44,15 @@ def hash_partition(list source_columns, object columns_to_hash,
# Hash `source_columns` with the algorithm named by `method` and wrap the
# resulting libcudf column in a `Column`.
#   source_columns: list of columns, viewed as one table via
#                   `table_view_from_columns`.
#   method: "murmur3", "md5" or "xxhash64"; any other value raises
#           ValueError("Unsupported hash function: ...").
#   seed:   passed to `murmurhash3_x86_32` and `xxhash_64`; the `md5` call
#           does not receive it. Defaults to 0.
# The libcudf calls are made inside `with nogil:` blocks, i.e. with the GIL
# released.
# NOTE(review): this listing appears to interleave removed and added diff
# lines (the `cpp_hash_id` / `cpp_hash` dispatch sits next to the
# per-algorithm calls) -- confirm against the actual file.
def hash(list source_columns, str method, int seed=0):
cdef table_view c_source_view = table_view_from_columns(source_columns)
cdef unique_ptr[column] c_result
cdef cpp_hash_id c_hash_function
if method == "murmur3":
c_hash_function = cpp_hash_id.HASH_MURMUR3
with nogil:
c_result = move(murmurhash3_x86_32(c_source_view, seed))
elif method == "md5":
c_hash_function = cpp_hash_id.HASH_MD5
with nogil:
c_result = move(md5(c_source_view))
elif method == "xxhash64":
with nogil:
c_result = move(xxhash_64(c_source_view, seed))
else:
raise ValueError(f"Unsupported hash function: {method}")
with nogil:
c_result = move(
cpp_hash(
c_source_view,
c_hash_function,
seed
)
)

return Column.from_unique_ptr(move(c_result))
22 changes: 11 additions & 11 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1732,16 +1732,17 @@ def hash_values(self, method="murmur3", seed=None):
Parameters
----------
method : {'murmur3', 'md5'}, default 'murmur3'
method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.
* murmur3: MurmurHash3 hash function
* md5: MD5 hash function
* xxhash64: xxHash64 hash function
seed : int, optional
Seed value to use for the hash function.
Note - This only has effect for the following supported
hash functions:
* murmur3: MurmurHash3 hash function.
Seed value to use for the hash function. This parameter is only
supported for 'murmur3' and 'xxhash64'.
Returns
-------
Expand Down Expand Up @@ -1795,14 +1796,13 @@ def hash_values(self, method="murmur3", seed=None):
2 fe061786ea286a515b772d91b0dfcd70
dtype: object
"""
seed_hash_methods = {"murmur3"}
seed_hash_methods = {"murmur3", "xxhash64"}
if seed is None:
seed = 0
elif method not in seed_hash_methods:
warnings.warn(
"Provided seed value has no effect for hash method"
f" `{method}`. Refer to the docstring for information"
" on hash methods that support the `seed` param"
"Provided seed value has no effect for the hash method "
f"`{method}`. Only {seed_hash_methods} support seeds."
)
# Note that both Series and DataFrame return Series objects from this
# calculation, necessitating the unfortunate circular reference to the
Expand Down
88 changes: 69 additions & 19 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

import array as arr
import contextlib
import datetime
import decimal
import io
Expand Down Expand Up @@ -1389,43 +1390,46 @@ def test_assign_callable(mapping):


# Exercises `hash_values` on a two-column DataFrame for every parametrized
# combination of row count, hash method and seed. It checks that:
#   * the result is a cudf.Series with one entry per row;
#   * the result dtype is the one listed per method in `expected_dtypes`
#     (uint32 for murmur3, object for md5, uint64 for xxhash64);
#   * rows 0 and -1, which are given identical data, hash to the same value;
#   * hashing the single-column frame gdf[["a"]] equals hashing the Series
#     gdf["a"].
# When a seed is passed to a method outside {"murmur3", "xxhash64"} the calls
# are expected to emit a UserWarning; otherwise a null context is used.
# NOTE(review): this listing appears to interleave removed and added diff
# lines (two `method` parametrizations, the old `if warning_expected:`
# branches next to the `with potential_warning:` form) -- confirm against
# the actual file.
@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
@pytest.mark.parametrize("method", ["murmur3", "md5"])
@pytest.mark.parametrize("method", ["murmur3", "md5", "xxhash64"])
@pytest.mark.parametrize("seed", [None, 42])
def test_dataframe_hash_values(nrows, method, seed):
warning_expected = seed is not None and method not in {
"murmur3",
"xxhash64",
}
potential_warning = (
pytest.warns(UserWarning, match="Provided seed value has no effect*")
if warning_expected
else contextlib.nullcontext()
)

gdf = cudf.DataFrame()
data = np.arange(nrows)
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_values()
with potential_warning:
out = gdf.hash_values(method=method, seed=seed)
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.uint32
expected_dtypes = {
"murmur3": np.uint32,
"md5": object,
"xxhash64": np.uint64,
}
assert out.dtype == expected_dtypes[method]

warning_expected = (
True if seed is not None and method not in {"murmur3"} else False
)
# Check single column
if warning_expected:
with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
else:
with potential_warning:
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
# First matches last
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
if warning_expected:
with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
else:
with potential_warning:
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)


@pytest.mark.parametrize("method", ["murmur3"])
@pytest.mark.parametrize("method", ["murmur3", "xxhash64"])
def test_dataframe_hash_values_seed(method):
gdf = cudf.DataFrame()
data = np.arange(10)
Expand All @@ -1439,6 +1443,52 @@ def test_dataframe_hash_values_seed(method):
assert_neq(out_one, out_two)


def test_dataframe_hash_values_xxhash64():
    """Pin xxhash64 results for two Series and a DataFrame to recorded values.

    Python has no built-in xxhash64 implementation and the test suite should
    not gain a dependency just for this check, so the hashes are compared
    against known-good values (a regression test).
    """
    frame = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]})
    frame["b"] = -frame["a"]

    # Each case: (object to hash, seed, recorded hash for every row).
    cases = [
        (
            frame["a"],
            0,
            [
                3803688792395291579,
                10706502109028787093,
                9835943264235290955,
                18031741628920313605,
                18446744073709551615,
            ],
        ),
        (
            frame["b"],
            42,
            [
                9826995235083043316,
                10150515573749944095,
                5005707091092326006,
                5326262080505358431,
                18446744073709551615,
            ],
        ),
        (
            frame,
            0,
            [
                10208049663714815266,
                4949201786888768834,
                18122173653994477335,
                11133539368563441730,
                18446744073709551615,
            ],
        ),
    ]

    for subject, seed, recorded in cases:
        actual = subject.hash_values(method="xxhash64", seed=seed)
        assert_eq(actual, cudf.Series(recorded, dtype=np.uint64))


@pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
@pytest.mark.parametrize("nparts", [1, 2, 8, 13])
@pytest.mark.parametrize("nkeys", [1, 2])
Expand Down

0 comments on commit 3f4ca49

Please sign in to comment.