From 174c8285d117c498dfb6dfdefe36ac9b12a9f93d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 4 Oct 2021 11:51:42 -0700 Subject: [PATCH 01/12] Add MD5 API to Python. --- python/cudf/cudf/core/dataframe.py | 14 +++++++++++++- python/cudf/cudf/core/series.py | 17 ++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6d41d90ab47..15f48d7499f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5003,7 +5003,7 @@ def apply_chunks( tpb=tpb, ) - def hash_columns(self, columns=None): + def hash_columns(self, columns=None, method="murmur3"): """Hash the given *columns* and return a new device array Parameters @@ -5011,7 +5011,19 @@ def hash_columns(self, columns=None): columns : sequence of str; optional Sequence of column names. If columns is *None* (unspecified), all columns in the frame are used. + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + Hash values for each row. """ + if method not in {"murmur3", "md5"}: + raise ValueError(f"Unsupported hash function: {method}") + if columns is None: table_to_hash = self else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8188290c392..5b0c1e84552 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4095,13 +4095,20 @@ def floor(self): """ return self._unaryop("floor") - def hash_values(self): + def hash_values(self, method="murmur3"): """Compute the hash of values in this column. + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + Returns ------- - cupy array - A cupy array with hash values. + Series + A Series with hash values. Examples -------- @@ -4112,10 +4119,10 @@ def hash_values(self): 1 120 2 30 dtype: int64 - >>> series.hash_values() + >>> series.hash_values(method="murmur3") array([-1930516747, 422619251, -941520876], dtype=int32) """ - return Series(self._hash()).values + return Series(self._hash()) def hash_encode(self, stop, use_name=False): """Encode column values as ints in [0, stop) using hash function. From 873d8e4a4ef60638a1d9f9aaf04c1dc5b952ad7d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 4 Oct 2021 11:58:33 -0700 Subject: [PATCH 02/12] Return Series. --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 15f48d7499f..1869f84114c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5030,7 +5030,7 @@ def hash_columns(self, columns=None, method="murmur3"): cols = [self[k]._column for k in columns] table_to_hash = Frame(data=dict(zip(columns, cols))) - return Series(table_to_hash._hash()).values + return Series(table_to_hash._hash()) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. From 6525b7e73078aacf57b3fec83b7053afa468be0e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 10:19:17 -0700 Subject: [PATCH 03/12] Add hash method parameter. --- python/cudf/cudf/_lib/hash.pyx | 11 ++++++++--- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/series.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index ae0116e635b..b50dd0fa804 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -54,17 +54,22 @@ def hash_partition(source_table, object columns_to_hash, ) -def hash(source_table, object initial_hash_values=None, int seed=0): +def hash(source_table, str method, object initial_hash_values=None, + int seed=0): cdef vector[uint32_t] c_initial_hash = initial_hash_values or [] cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) - cdef unique_ptr[column] c_result + cdef libcudf_types.hash_id c_hash_id + if method == "murmur3": + c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + else: + raise ValueError(f"Unsupported hash function: {method}") with nogil: c_result = move( cpp_hash( c_source_view, - libcudf_types.hash_id.HASH_MURMUR3, + c_hash_function, c_initial_hash, seed ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 12fe7f313eb..463dc16b840 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False): result._index.names = self._index.names return result - def _hash(self, initial_hash_values=None): - return libcudf.hash.hash(self, initial_hash_values) + def _hash(self, method, initial_hash_values=None): + return libcudf.hash.hash(self, method, initial_hash_values) def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5b0c1e84552..951c7756537 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4122,7 +4122,7 @@ def hash_values(self, method="murmur3"): >>> series.hash_values(method="murmur3") array([-1930516747, 422619251, -941520876], dtype=int32) """ - return Series(self._hash()) + return Series(self._hash(method=method)) def hash_encode(self, stop, use_name=False): """Encode column values as ints in [0, stop) using hash function. From 10354821eca639b067aab1af86c4d4b2b54de9f2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 11:12:54 -0700 Subject: [PATCH 04/12] Add MD5 API. --- python/cudf/cudf/_lib/hash.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index b50dd0fa804..6892831620f 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -63,6 +63,8 @@ def hash(source_table, str method, object initial_hash_values=None, cdef libcudf_types.hash_id c_hash_id if method == "murmur3": c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + elif method == "md5": + c_hash_function = libcudf_types.hash_id.HASH_MD5 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: From 6a70c54332fca180b17d3cb5503696e39ffd3fdd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 11:56:42 -0700 Subject: [PATCH 05/12] Update hash API internals. --- python/cudf/cudf/_lib/hash.pyx | 4 ++-- python/cudf/cudf/core/dataframe.py | 5 +---- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 13 +++++++------ 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 6892831620f..dd25e01a9e7 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -54,9 +54,9 @@ def hash_partition(source_table, object columns_to_hash, ) -def hash(source_table, str method, object initial_hash_values=None, +def hash(source_table, str method, object initial_hash=None, int seed=0): - cdef vector[uint32_t] c_initial_hash = initial_hash_values or [] + cdef vector[uint32_t] c_initial_hash = initial_hash or [] cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1869f84114c..9fb29309672 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5021,16 +5021,13 @@ def hash_columns(self, columns=None, method="murmur3"): Series Hash values for each row. """ - if method not in {"murmur3", "md5"}: - raise ValueError(f"Unsupported hash function: {method}") - if columns is None: table_to_hash = self else: cols = [self[k]._column for k in columns] table_to_hash = Frame(data=dict(zip(columns, cols))) - return Series(table_to_hash._hash()) + return Series(table_to_hash._hash(method=method)) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 463dc16b840..581ae6a5204 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False): result._index.names = self._index.names return result - def _hash(self, method, initial_hash_values=None): - return libcudf.hash.hash(self, method, initial_hash_values) + def _hash(self, method, initial_hash=None): + return libcudf.hash.hash(self, method, initial_hash) def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 951c7756537..554f7535c30 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4165,7 +4165,9 @@ def hash_encode(self, stop, use_name=False): raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None - hashed_values = Series(self._hash(initial_hash)) + hashed_values = Series( + self._hash(method="murmur3", initial_hash=initial_hash) + ) if hashed_values.has_nulls: raise ValueError("Column must have no nulls.") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5a839507182..b42bfa6d2e2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1103,27 +1103,28 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -def test_dataframe_hash_columns(nrows): +@pytest.mark.parametrize("method", ["murmur3", "md5"]) +def test_dataframe_hash_columns(nrows, method): gdf = cudf.DataFrame() data = np.asarray(range(nrows)) data[0] = data[-1] # make first and last the same gdf["a"] = data gdf["b"] = gdf.a + 100 out = gdf.hash_columns(["a", "b"]) - assert isinstance(out, cupy.ndarray) + # assert isinstance(out, cupy.ndarray) assert len(out) == nrows assert out.dtype == np.int32 # Check default out_all = gdf.hash_columns() - np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all)) + assert_eq(out, out_all) # Check single column - out_one = cupy.asnumpy(gdf.hash_columns(["a"])) + out_one = gdf.hash_columns(["a"], method=method) # First matches last - assert out_one[0] == out_one[-1] + assert out_one.iloc[0] == out_one.iloc[-1] # Equivalent to the cudf.Series.hash_values() - np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one) + assert_eq(gdf["a"].hash_values(method=method), out_one) @pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) From eda11339ca1b63baefa4ee171b3ad4513ed250b2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 12:05:12 -0700 Subject: [PATCH 06/12] Fix variable name. --- python/cudf/cudf/_lib/hash.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index dd25e01a9e7..f15052f33e9 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -60,7 +60,7 @@ def hash(source_table, str method, object initial_hash=None, cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result - cdef libcudf_types.hash_id c_hash_id + cdef libcudf_types.hash_id c_hash_function if method == "murmur3": c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 elif method == "md5": From 2c60587884c35338863bea673fb25ee693f6ae7d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 12:09:29 -0700 Subject: [PATCH 07/12] Check hash results are a Series. --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b42bfa6d2e2..5a9fcb615e5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1111,7 +1111,7 @@ def test_dataframe_hash_columns(nrows, method): gdf["a"] = data gdf["b"] = gdf.a + 100 out = gdf.hash_columns(["a", "b"]) - # assert isinstance(out, cupy.ndarray) + assert isinstance(out, cudf.Series) assert len(out) == nrows assert out.dtype == np.int32 From 8962cf62278c4f7692bec93ab73b6a0f61db29c8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Oct 2021 12:19:41 -0700 Subject: [PATCH 08/12] Wrap line. --- python/cudf/cudf/_lib/hash.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index f15052f33e9..9b34a049cac 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -54,8 +54,7 @@ def hash_partition(source_table, object columns_to_hash, ) -def hash(source_table, str method, object initial_hash=None, - int seed=0): +def hash(source_table, str method, object initial_hash=None, int seed=0): cdef vector[uint32_t] c_initial_hash = initial_hash or [] cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) From bce219be88fed9afcbc82e83b51ba0c3323c4808 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Oct 2021 07:40:05 -0700 Subject: [PATCH 09/12] Use Series._from_data. --- python/cudf/cudf/core/dataframe.py | 4 +++- python/cudf/cudf/core/series.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9fb29309672..7c706fda177 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5027,7 +5027,9 @@ def hash_columns(self, columns=None, method="murmur3"): cols = [self[k]._column for k in columns] table_to_hash = Frame(data=dict(zip(columns, cols))) - return Series(table_to_hash._hash(method=method)) + return Series._from_data( + {None: table_to_hash._hash(method=method)}, index=self.index + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 554f7535c30..f4352147f45 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4122,7 +4122,9 @@ def hash_values(self, method="murmur3"): >>> series.hash_values(method="murmur3") array([-1930516747, 422619251, -941520876], dtype=int32) """ - return Series(self._hash(method=method)) + return Series._from_data( + {None: self._hash(method=method)}, index=self.index + ) def hash_encode(self, stop, use_name=False): """Encode column values as ints in [0, stop) using hash function. From c012707597fca1ff07b619daf900cd1078414029 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Oct 2021 07:43:41 -0700 Subject: [PATCH 10/12] Use self._data, simplify expression. --- python/cudf/cudf/core/dataframe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7c706fda177..0b4f77d4f93 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5021,11 +5021,11 @@ def hash_columns(self, columns=None, method="murmur3"): Series Hash values for each row. """ - if columns is None: - table_to_hash = self - else: - cols = [self[k]._column for k in columns] - table_to_hash = Frame(data=dict(zip(columns, cols))) + table_to_hash = ( + self + if columns is None + else Frame(data={k: self._data[k] for k in columns}) + ) return Series._from_data( {None: table_to_hash._hash(method=method)}, index=self.index From 0bd1a562214cbc7c9deb8e6112c0e7b379e7d3e7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Oct 2021 08:06:17 -0700 Subject: [PATCH 11/12] Use Series._from_data in hash_encode. --- python/cudf/cudf/core/series.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f4352147f45..1cd45706a1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4167,15 +4167,19 @@ def hash_encode(self, stop, use_name=False): raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None - hashed_values = Series( - self._hash(method="murmur3", initial_hash=initial_hash) + hashed_values = Series._from_data( + { + self.name: self._hash( + method="murmur3", initial_hash=initial_hash + ) + }, + self.index, ) if hashed_values.has_nulls: raise ValueError("Column must have no nulls.") - mod_vals = hashed_values % stop - return Series(mod_vals._column, index=self.index, name=self.name) + return hashed_values % stop def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True From 29de972590c5561ae6a16c76e93ce3e3ddecc9a3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Oct 2021 19:16:44 -0700 Subject: [PATCH 12/12] Add tests for MD5 Python API. --- python/cudf/cudf/tests/test_series.py | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ca179703864..cd18177fcf0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1272,3 +1272,47 @@ def test_series_sort_index( assert_eq(ps, gs, check_index_type=True) else: assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize( + "method,validation_data", + [ + ( + "md5", + [ + "d41d8cd98f00b204e9800998ecf8427e", + "cfcd208495d565ef66e7dff9f98764da", + "3d3aaae21d57b101227f0384f644abe0", + "3e76c7023d771ad1c1520c27ab3d4874", + "f8d805e33ec3ade1a6ea251ac1c118e7", + "c30515f66a5aec7af7666abf33600c92", + "c61a4185135eda043f35e92c3505e180", + "52da74c75cb6575d25be29e66bd0adde", + "5152ac13bdd09110d9ee9c169a3d9237", + "f1d3ff8443297732862df21dc4e57262", + ], + ) + ], +) +def test_series_hash_values(method, validation_data): + inputs = cudf.Series( + [ + "", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + ( + "A very long (greater than 128 bytes/char string) to execute " + "a multi hash-step data point in the hash function being " + "tested. This string needed to be longer." + ), + "All work and no play makes Jack a dull boy", + "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "\x00\x00\x00\x10\x00\x00\x00\x00", + "\x00\x00\x00\x00", + ] + ) + validation_results = cudf.Series(validation_data) + hash_values = inputs.hash_values(method=method) + assert_eq(hash_values, validation_results)