# Cython declarations for the hashing functions declared in the C++ header
# "cudf/hashing.hpp" under the namespace ``cudf::hashing``.
#
# The extern block is marked ``nogil``, so these functions may be called with
# the GIL released. Every declaration ends in ``except +`` so that a C++
# exception is translated into a Python exception instead of aborting.
cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:

    # MurmurHash3 (x86, 32-bit variant per the C++ name); takes a 32-bit seed.
    cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
        const table_view& input,
        const uint32_t seed
    ) except +

    # MD5; this declaration takes no seed argument.
    cdef unique_ptr[column] md5 "cudf::hashing::md5" (
        const table_view& input
    ) except +

    # xxHash64; note the seed is 64-bit here, unlike murmurhash3_x86_32.
    cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
        const table_view& input,
        const uint64_t seed
    ) except +
def hash(list source_columns, str method, int seed=0):
    """Hash ``source_columns`` with the libcudf hasher selected by ``method``.

    Parameters
    ----------
    source_columns : list
        Columns handed to ``table_view_from_columns`` to build the input
        table view.
    method : str
        Name of the hash function: ``"murmur3"``, ``"md5"`` or
        ``"xxhash64"``.
    seed : int, default 0
        Forwarded to the murmur3 and xxhash64 hashers. It is not passed to
        md5.

    Returns
    -------
    Column
        Column wrapping the result returned by the selected libcudf function.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    cdef table_view c_input = table_view_from_columns(source_columns)
    cdef unique_ptr[column] c_hashed

    # Guard clause: reject unknown names first so the branches below only
    # have to dispatch.
    if method not in ("murmur3", "md5", "xxhash64"):
        raise ValueError(f"Unsupported hash function: {method}")

    # NOTE(review): ``seed`` is a C ``int`` here, while the C++ declarations
    # take ``uint32_t`` (murmur3) and ``uint64_t`` (xxhash64) -- confirm that
    # negative or very large seeds behave as intended.
    if method == "md5":
        with nogil:
            c_hashed = move(md5(c_input))
    elif method == "xxhash64":
        with nogil:
            c_hashed = move(xxhash_64(c_input, seed))
    else:
        with nogil:
            c_hashed = move(murmurhash3_x86_32(c_input, seed))

    return Column.from_unique_ptr(move(c_hashed))
+ + * murmur3: MurmurHash3 hash function + * md5: MD5 hash function + * xxhash64: xxHash64 hash function seed : int, optional - Seed value to use for the hash function. - Note - This only has effect for the following supported - hash functions: - * murmur3: MurmurHash3 hash function. + Seed value to use for the hash function. This parameter is only + supported for 'murmur3' and 'xxhash64'. + Returns ------- @@ -1795,14 +1796,13 @@ def hash_values(self, method="murmur3", seed=None): 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3"} + seed_hash_methods = {"murmur3", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: warnings.warn( - "Provided seed value has no effect for hash method" - f" `{method}`. Refer to the docstring for information" - " on hash methods that support the `seed` param" + "Provided seed value has no effect for the hash method " + f"`{method}`. Only {seed_hash_methods} support seeds." ) # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6ddab6ed3f5..2a0edf09079 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. 
@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
@pytest.mark.parametrize("method", ["murmur3", "md5", "xxhash64"])
@pytest.mark.parametrize("seed", [None, 42])
def test_dataframe_hash_values(nrows, method, seed):
    """Check the basic contract of ``hash_values`` for every hash method.

    Verifies that the result is a ``cudf.Series`` with one entry per row and
    the dtype expected for ``method``, that identical rows hash identically,
    and that ``Series.hash_values`` agrees with the single-column
    ``DataFrame.hash_values``. When a seed is passed to a method outside
    ``seeded_methods`` (here ``md5``), a ``UserWarning`` is expected on each
    call.
    """
    seeded_methods = {"murmur3", "xxhash64"}
    warning_expected = seed is not None and method not in seeded_methods

    def potential_warning():
        # Build a fresh context manager for every ``with`` block instead of
        # re-entering a single instance several times.
        if warning_expected:
            # ``match`` is a regular expression used as a search, so the
            # message prefix is sufficient; a trailing ``*`` would only
            # quantify the final "t" rather than act as a wildcard.
            return pytest.warns(
                UserWarning, match="Provided seed value has no effect"
            )
        return contextlib.nullcontext()

    gdf = cudf.DataFrame()
    data = np.arange(nrows)
    data[0] = data[-1]  # make first and last the same
    gdf["a"] = data
    gdf["b"] = gdf.a + 100
    with potential_warning():
        out = gdf.hash_values(method=method, seed=seed)
    assert isinstance(out, cudf.Series)
    assert len(out) == nrows
    expected_dtypes = {
        "murmur3": np.uint32,
        "md5": object,
        "xxhash64": np.uint64,
    }
    assert out.dtype == expected_dtypes[method]

    # Check single column
    with potential_warning():
        out_one = gdf[["a"]].hash_values(method=method, seed=seed)
    # First matches last
    assert out_one.iloc[0] == out_one.iloc[-1]
    # Equivalent to the cudf.Series.hash_values()
    with potential_warning():
        assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
def test_dataframe_hash_values_xxhash64():
    """Regression-test ``hash_values(method="xxhash64")`` on known values."""
    # xxhash64 has no built-in implementation in Python and we don't want to
    # add a testing dependency, so the expected hashes below are pinned
    # known-good values rather than being computed independently.
    frame = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]})
    frame["b"] = -frame["a"]

    def as_uint64(values):
        # All expected results are unsigned 64-bit Series.
        return cudf.Series(values, dtype=np.uint64)

    # (object to hash, seed, expected hash per row)
    cases = [
        (
            frame["a"],
            0,
            [
                3803688792395291579,
                10706502109028787093,
                9835943264235290955,
                18031741628920313605,
                18446744073709551615,
            ],
        ),
        (
            frame["b"],
            42,
            [
                9826995235083043316,
                10150515573749944095,
                5005707091092326006,
                5326262080505358431,
                18446744073709551615,
            ],
        ),
        (
            frame,
            0,
            [
                10208049663714815266,
                4949201786888768834,
                18122173653994477335,
                11133539368563441730,
                18446744073709551615,
            ],
        ),
    ]

    for hashable, seed, expected in cases:
        actual = hashable.hash_values(method="xxhash64", seed=seed)
        assert_eq(actual, as_uint64(expected))