Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor IndexedFrame.hash_values to use cudf::hashing functions, add xxhash64 to cudf Python. #14538

Merged
merged 5 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

Expand All @@ -9,16 +9,18 @@ from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view


cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:

ctypedef enum hash_id "cudf::hash_id":
HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY"
HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3"
HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3"
HASH_MD5 "cudf::hash_id::HASH_MD5"

cdef unique_ptr[column] hash "cudf::hash" (
cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
const table_view& input,
const hash_id hash_function,
const uint32_t seed
) except +

cdef unique_ptr[column] md5 "cudf::hashing::md5" (
const table_view& input
) except +

cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
const table_view& input,
const uint64_t seed
) except +
23 changes: 9 additions & 14 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

Expand All @@ -10,7 +10,7 @@ from libcpp.vector cimport vector
cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
from cudf._lib.cpp.hash cimport md5, murmurhash3_x86_32, xxhash_64
from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
Expand Down Expand Up @@ -44,20 +44,15 @@ def hash_partition(list source_columns, object columns_to_hash,
def hash(list source_columns, str method, int seed=0):
cdef table_view c_source_view = table_view_from_columns(source_columns)
cdef unique_ptr[column] c_result
cdef cpp_hash_id c_hash_function
if method == "murmur3":
c_hash_function = cpp_hash_id.HASH_MURMUR3
with nogil:
c_result = move(murmurhash3_x86_32(c_source_view, seed))
elif method == "md5":
c_hash_function = cpp_hash_id.HASH_MD5
with nogil:
c_result = move(md5(c_source_view))
elif method == "xxhash64":
with nogil:
c_result = move(xxhash_64(c_source_view, seed))
else:
raise ValueError(f"Unsupported hash function: {method}")
with nogil:
c_result = move(
cpp_hash(
c_source_view,
c_hash_function,
seed
)
)

return Column.from_unique_ptr(move(c_result))
22 changes: 11 additions & 11 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1734,16 +1734,17 @@ def hash_values(self, method="murmur3", seed=None):

Parameters
----------
method : {'murmur3', 'md5'}, default 'murmur3'
method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

* murmur3: MurmurHash3 hash function
* md5: MD5 hash function
* xxhash64: xxHash64 hash function

seed : int, optional
Seed value to use for the hash function.
Note - This only has effect for the following supported
hash functions:
* murmur3: MurmurHash3 hash function.
Seed value to use for the hash function. This parameter is only
supported for 'murmur3' and 'xxhash64'.


Returns
-------
Expand Down Expand Up @@ -1797,14 +1798,13 @@ def hash_values(self, method="murmur3", seed=None):
2 fe061786ea286a515b772d91b0dfcd70
dtype: object
"""
seed_hash_methods = {"murmur3"}
seed_hash_methods = {"murmur3", "xxhash64"}
if seed is None:
seed = 0
elif method not in seed_hash_methods:
warnings.warn(
"Provided seed value has no effect for hash method"
f" `{method}`. Refer to the docstring for information"
" on hash methods that support the `seed` param"
"Provided seed value has no effect for the hash method "
f"`{method}`. Only {seed_hash_methods} support seeds."
)
# Note that both Series and DataFrame return Series objects from this
# calculation, necessitating the unfortunate circular reference to the
Expand Down
88 changes: 69 additions & 19 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

import array as arr
import contextlib
import datetime
import decimal
import io
Expand Down Expand Up @@ -1389,43 +1390,46 @@ def test_assign_callable(mapping):


@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
@pytest.mark.parametrize("method", ["murmur3", "md5"])
@pytest.mark.parametrize("method", ["murmur3", "md5", "xxhash64"])
@pytest.mark.parametrize("seed", [None, 42])
def test_dataframe_hash_values(nrows, method, seed):
warning_expected = seed is not None and method not in {
"murmur3",
"xxhash64",
}
potential_warning = (
pytest.warns(UserWarning, match="Provided seed value has no effect*")
if warning_expected
else contextlib.nullcontext()
)

gdf = cudf.DataFrame()
data = np.arange(nrows)
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_values()
with potential_warning:
out = gdf.hash_values(method=method, seed=seed)
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.uint32
expected_dtypes = {
"murmur3": np.uint32,
"md5": object,
"xxhash64": np.uint64,
}
assert out.dtype == expected_dtypes[method]

warning_expected = (
True if seed is not None and method not in {"murmur3"} else False
)
# Check single column
if warning_expected:
with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
else:
with potential_warning:
out_one = gdf[["a"]].hash_values(method=method, seed=seed)
# First matches last
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
if warning_expected:
with pytest.warns(
UserWarning, match="Provided seed value has no effect*"
):
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
else:
with potential_warning:
assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)


@pytest.mark.parametrize("method", ["murmur3"])
@pytest.mark.parametrize("method", ["murmur3", "xxhash64"])
def test_dataframe_hash_values_seed(method):
gdf = cudf.DataFrame()
data = np.arange(10)
Expand All @@ -1439,6 +1443,52 @@ def test_dataframe_hash_values_seed(method):
assert_neq(out_one, out_two)


def test_dataframe_hash_values_xxhash64():
    """Regression-test ``hash_values(method="xxhash64")`` on fixed inputs.

    xxhash64 has no built-in implementation in Python and we don't want to
    add a testing dependency, so the results are compared against known
    good values instead of an independent reference implementation.

    Three cases are pinned: column ``a`` with seed 0, column ``b`` (the
    negation of ``a``) with seed 42, and the whole two-column frame with
    seed 0. The input rows are 0.0, 1.0, 2.0, inf and nan.
    """

    def as_uint64(values):
        # Every expected result is a uint64 Series of hash values.
        return cudf.Series(values, dtype=np.uint64)

    frame = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]})
    frame["b"] = -frame["a"]

    # Single column, default-equivalent seed.
    assert_eq(
        frame["a"].hash_values(method="xxhash64", seed=0),
        as_uint64(
            [
                3803688792395291579,
                10706502109028787093,
                9835943264235290955,
                18031741628920313605,
                18446744073709551615,
            ]
        ),
    )

    # Single (negated) column, non-zero seed.
    assert_eq(
        frame["b"].hash_values(method="xxhash64", seed=42),
        as_uint64(
            [
                9826995235083043316,
                10150515573749944095,
                5005707091092326006,
                5326262080505358431,
                18446744073709551615,
            ]
        ),
    )

    # Row-wise hash across both columns of the frame.
    assert_eq(
        frame.hash_values(method="xxhash64", seed=0),
        as_uint64(
            [
                10208049663714815266,
                4949201786888768834,
                18122173653994477335,
                11133539368563441730,
                18446744073709551615,
            ]
        ),
    )


@pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
@pytest.mark.parametrize("nparts", [1, 2, 8, 13])
@pytest.mark.parametrize("nkeys", [1, 2])
Expand Down