From a61fc558531693f68e18ad29dc0b73610c5d1c70 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 6 Jan 2022 10:44:09 -0600 Subject: [PATCH] Minor cleanup of unused Python functions (#9974) This PR just removes some unused internal functions and inlines some single-use functions that were defined at the wrong levels of the class hierarchy (largely `Frame` internal methods that were exclusively called in a single `DataFrame` method). Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9974 --- python/cudf/cudf/core/dataframe.py | 47 ++---------- python/cudf/cudf/core/frame.py | 91 ++++++---------------- python/cudf/cudf/core/indexed_frame.py | 64 ++++++++++++++++ python/cudf/cudf/core/series.py | 39 ---------- python/cudf/cudf/tests/test_dataframe.py | 97 ------------------------ 5 files changed, 93 insertions(+), 245 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3366a0af4ba..197011e629d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4078,45 +4078,6 @@ def apply_chunks( tpb=tpb, ) - def hash_values(self, method="murmur3"): - """Compute the hash of values in each row. - - Parameters - ---------- - method : {'murmur3', 'md5'}, default 'murmur3' - Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. - - Returns - ------- - Series - A Series with hash values. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) - >>> df - a b - 0 10 0.00 - 1 120 0.25 - 2 30 0.50 - >>> df.hash_values(method="murmur3") - 0 -330519225 - 1 -397962448 - 2 -1345834934 - dtype: int32 - >>> df.hash_values(method="md5") - 0 57ce879751b5169c525907d5c563fae1 - 1 948d6221a7c4963d4be411bcead7e32b - 2 fe061786ea286a515b772d91b0dfcd70 - dtype: object - """ - return Series._from_data( - {None: self._hash(method=method)}, index=self.index - ) - def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4140,7 +4101,13 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else self._index._num_columns ) key_indices = [self._data.names.index(k) + idx for k in columns] - outdf, offsets = self._hash_partition(key_indices, nparts, keep_index) + + output_data, output_index, offsets = libcudf.hash.hash_partition( + self, key_indices, nparts, keep_index + ) + outdf = self.__class__._from_data(output_data, output_index) + outdf._copy_type_metadata(self, include_index=keep_index) + # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bae15c5e9fd..539408b6afb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -83,13 +83,6 @@ def __init__(self, data=None, index=None): def _num_columns(self) -> int: return len(self._data) - @property - def _num_indices(self) -> int: - if self._index is None: - return 0 - else: - return len(self._index_names) - @property def _num_rows(self) -> int: if self._index is not None: @@ -269,15 +262,6 @@ def shape(self): """Returns a tuple representing the dimensionality of the DataFrame.""" return self._num_rows, self._num_columns - @property - def _is_homogeneous(self): - # make sure that the dataframe has columns - if not self._data.columns: 
- return True - - first_type = self._data.columns[0].dtype.name - return all(x.dtype.name == first_type for x in self._data.columns) - @property def empty(self): """ @@ -580,19 +564,6 @@ def _gather( result._copy_type_metadata(self) return result - def _hash(self, method): - return libcudf.hash.hash(self, method) - - def _hash_partition( - self, columns_to_hash, num_partitions, keep_index=True - ): - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, columns_to_hash, num_partitions, keep_index - ) - output = self.__class__._from_data(output_data, output_index) - output._copy_type_metadata(self, include_index=keep_index) - return output, offsets - def _as_column(self): """ _as_column : Converts a single columned Frame to Column @@ -1009,30 +980,6 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) - def _partition(self, scatter_map, npartitions, keep_index=True): - - data, index, output_offsets = libcudf.partitioning.partition( - self, scatter_map, npartitions, keep_index - ) - partitioned = self.__class__._from_data(data, index) - - # due to the split limitation mentioned - # here: https://github.com/rapidsai/cudf/issues/4607 - # we need to remove first & last elements in offsets. - # TODO: Remove this after the above issue is fixed. - output_offsets = output_offsets[1:-1] - - result = partitioned._split(output_offsets, keep_index=keep_index) - - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - - if npartitions: - for _ in range(npartitions - len(result)): - result.append(self._empty_like(keep_index)) - - return result - def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1139,9 +1086,29 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." 
) - tables = self._partition(map_index, map_size, keep_index) + data, index, output_offsets = libcudf.partitioning.partition( + self, map_index, map_size, keep_index + ) + partitioned = self.__class__._from_data(data, index) - return tables + # due to the split limitation mentioned + # here: https://github.com/rapidsai/cudf/issues/4607 + # we need to remove first & last elements in offsets. + # TODO: Remove this after the above issue is fixed. + output_offsets = output_offsets[1:-1] + + result = partitioned._split(output_offsets, keep_index=keep_index) + + for frame in result: + frame._copy_type_metadata(self, include_index=keep_index) + + if map_size: + result += [ + self._empty_like(keep_index) + for _ in range(map_size - len(result)) + ] + + return result def dropna( self, axis=0, how="any", thresh=None, subset=None, inplace=False @@ -1499,8 +1466,6 @@ def _apply_boolean_mask(self, boolean_mask): Applies boolean mask to each row of `self`, rows corresponding to `False` is dropped """ - boolean_mask = as_column(boolean_mask) - result = self.__class__._from_data( *libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) @@ -2503,18 +2468,6 @@ def _copy_type_metadata( return self - def _copy_interval_data(self, other, include_index=True): - for name, col, other_col in zip( - self._data.keys(), self._data.values(), other._data.values() - ): - if isinstance(other_col, cudf.core.column.IntervalColumn): - self._data[name] = cudf.core.column.IntervalColumn(col) - - def _postprocess_columns(self, other, include_index=True): - self._copy_categories(other, include_index=include_index) - self._copy_struct_names(other, include_index=include_index) - self._copy_interval_data(other, include_index=include_index) - def isnull(self): """ Identify missing values. 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4be35d960ee..ecacb1ff326 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -445,6 +445,70 @@ def sort_index( out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + def hash_values(self, method="murmur3"): + """Compute the hash of values in each row. + + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + A Series with hash values. + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([10, 120, 30]) + >>> series + 0 10 + 1 120 + 2 30 + dtype: int64 + >>> series.hash_values(method="murmur3") + 0 -1930516747 + 1 422619251 + 2 -941520876 + dtype: int32 + >>> series.hash_values(method="md5") + 0 7be4bbacbfdb05fb3044e36c22b41e8b + 1 947ca8d2c5f0f27437f156cfbfab0969 + 2 d0580ef52d27c043c8e341fd5039b166 + dtype: object + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) + >>> df + a b + 0 10 0.00 + 1 120 0.25 + 2 30 0.50 + >>> df.hash_values(method="murmur3") + 0 -330519225 + 1 -397962448 + 2 -1345834934 + dtype: int32 + >>> df.hash_values(method="md5") + 0 57ce879751b5169c525907d5c563fae1 + 1 948d6221a7c4963d4be411bcead7e32b + 2 fe061786ea286a515b772d91b0dfcd70 + dtype: object + """ + # Note that both Series and DataFrame return Series objects from this + # calculation, necessitating the unfortunate circular reference to the + # child class here. 
+ return cudf.Series._from_data( + {None: libcudf.hash.hash(self, method)}, index=self.index + ) + def _gather( self, gather_map, keep_index=True, nullify=False, check_bounds=True ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 178c40b3cd8..a0e359d1278 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3043,45 +3043,6 @@ def value_counts( res = res / float(res._column.sum()) return res - def hash_values(self, method="murmur3"): - """Compute the hash of values in this column. - - Parameters - ---------- - method : {'murmur3', 'md5'}, default 'murmur3' - Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. - - Returns - ------- - Series - A Series with hash values. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 120, 30]) - >>> series - 0 10 - 1 120 - 2 30 - dtype: int64 - >>> series.hash_values(method="murmur3") - 0 -1930516747 - 1 422619251 - 2 -941520876 - dtype: int32 - >>> series.hash_values(method="md5") - 0 7be4bbacbfdb05fb3044e36c22b41e8b - 1 947ca8d2c5f0f27437f156cfbfab0969 - 2 d0580ef52d27c043c8e341fd5039b166 - dtype: object - """ - return Series._from_data( - {None: self._hash(method=method)}, index=self.index - ) - def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f42920b7c50..73f9cb858e1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8696,103 +8696,6 @@ def test_dataframe_init_from_series(data, columns, index): ) -@pytest.mark.parametrize( - "data, expected", - [ - ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), - ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), - ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), - ({"a": [True, False, False], "b": [False, False, True]}, True), - 
({"a": [True, False, False]}, True), - ({"a": [[1, 2], [3, 4]]}, True), - ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), - ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), - ({}, True), - ], -) -def test_is_homogeneous_dataframe(data, expected): - actual = cudf.DataFrame(data)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "data, indexes, expected", - [ - ( - {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, - ["a", "b"], - True, - ), - ( - { - "a": [1, 2, 3, 4], - "b": [5, 6, 7, 8], - "c": [1.2, 1, 2, 3], - "d": ["hello", "world", "cudf", "rapids"], - }, - ["a", "b"], - False, - ), - ( - { - "a": ["a", "b", "c"], - "b": [4, 5, 6], - "c": [7, 8, 9], - "d": [1, 2, 3], - }, - ["a", "b"], - True, - ), - ], -) -def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): - test_dataframe = cudf.DataFrame(data).set_index(indexes) - actual = cudf.DataFrame(test_dataframe)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] -) -def test_is_homogeneous_series(data, expected): - actual = cudf.Series(data)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "levels, codes, expected", - [ - ( - [["lama", "cow", "falcon"], ["speed", "weight", "length"]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - True, - ), - ( - [[1, 2, 3], [True, False, True]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - False, - ), - ], -) -def test_is_homogeneous_multiIndex(levels, codes, expected): - actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous - - assert actual == expected - - -@pytest.mark.parametrize( - "data, expected", - [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], -) -def test_is_homogeneous_index(data, expected): - actual = cudf.Index(data)._is_homogeneous - - assert actual == expected - - def test_frame_series_where(): 
gdf = cudf.DataFrame( {"a": [1.0, 2.0, None, 3.0, None], "b": [None, 10.0, 11.0, None, 23.0]}