Skip to content

Commit

Permalink
Minor cleanup of unused Python functions (#9974)
Browse files Browse the repository at this point in the history
This PR just removes some unused internal functions and inlines some single-use functions that were defined at the wrong levels of the class hierarchy (largely `Frame` internal methods that were exclusively called in a single `DataFrame` method).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9974
  • Loading branch information
vyasr authored Jan 6, 2022
1 parent b1de945 commit a61fc55
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 245 deletions.
47 changes: 7 additions & 40 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4078,45 +4078,6 @@ def apply_chunks(
tpb=tpb,
)

def hash_values(self, method="murmur3"):
    """Compute a hash value for each row.

    Parameters
    ----------
    method : {'murmur3', 'md5'}, default 'murmur3'
        Hash function to use:

        * murmur3: MurmurHash3 hash function.
        * md5: MD5 hash function.

    Returns
    -------
    Series
        A Series with hash values.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]})
    >>> df
         a     b
    0   10  0.00
    1  120  0.25
    2   30  0.50
    >>> df.hash_values(method="murmur3")
    0     -330519225
    1     -397962448
    2    -1345834934
    dtype: int32
    >>> df.hash_values(method="md5")
    0    57ce879751b5169c525907d5c563fae1
    1    948d6221a7c4963d4be411bcead7e32b
    2    fe061786ea286a515b772d91b0dfcd70
    dtype: object
    """
    # The row hashes come back as a single unnamed column; wrap them in a
    # Series aligned with this frame's index.
    row_hashes = self._hash(method=method)
    return Series._from_data({None: row_hashes}, index=self.index)

def partition_by_hash(self, columns, nparts, keep_index=True):
"""Partition the dataframe by the hashed value of data in *columns*.
Expand All @@ -4140,7 +4101,13 @@ def partition_by_hash(self, columns, nparts, keep_index=True):
else self._index._num_columns
)
key_indices = [self._data.names.index(k) + idx for k in columns]
outdf, offsets = self._hash_partition(key_indices, nparts, keep_index)

output_data, output_index, offsets = libcudf.hash.hash_partition(
self, key_indices, nparts, keep_index
)
outdf = self.__class__._from_data(output_data, output_index)
outdf._copy_type_metadata(self, include_index=keep_index)

# Slice into partition
return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])]

Expand Down
91 changes: 22 additions & 69 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,6 @@ def __init__(self, data=None, index=None):
def _num_columns(self) -> int:
    # Number of columns held in this frame's column accessor (``_data``).
    return len(self._data)

@property
def _num_indices(self) -> int:
    # Number of levels in the index, or 0 for an index-less frame.
    return 0 if self._index is None else len(self._index_names)

@property
def _num_rows(self) -> int:
if self._index is not None:
Expand Down Expand Up @@ -269,15 +262,6 @@ def shape(self):
"""Returns a tuple representing the dimensionality of the DataFrame."""
return self._num_rows, self._num_columns

@property
def _is_homogeneous(self):
    """True when every column shares one dtype (vacuously true if empty)."""
    columns = self._data.columns
    if not columns:
        return True
    reference_dtype = columns[0].dtype.name
    return all(col.dtype.name == reference_dtype for col in columns)

@property
def empty(self):
"""
Expand Down Expand Up @@ -580,19 +564,6 @@ def _gather(
result._copy_type_metadata(self)
return result

def _hash(self, method):
    # Delegate row-wise hashing to the libcudf backend; ``method`` names
    # the hash function (e.g. "murmur3" or "md5") and is passed through
    # unchanged.
    return libcudf.hash.hash(self, method)

def _hash_partition(
    self, columns_to_hash, num_partitions, keep_index=True
):
    """Reorder rows into hash-based partitions.

    Returns the reordered frame together with the offsets at which each
    partition begins.
    """
    data, index, part_offsets = libcudf.hash.hash_partition(
        self, columns_to_hash, num_partitions, keep_index
    )
    partitioned = self.__class__._from_data(data, index)
    # Restore dtype metadata (categories, etc.) lost in the libcudf call.
    partitioned._copy_type_metadata(self, include_index=keep_index)
    return partitioned, part_offsets

def _as_column(self):
"""
_as_column : Converts a single columned Frame to Column
Expand Down Expand Up @@ -1009,30 +980,6 @@ def mask(self, cond, other=None, inplace=False):

return self.where(cond=~cond, other=other, inplace=inplace)

def _partition(self, scatter_map, npartitions, keep_index=True):
    """Split rows into a list of frames according to ``scatter_map``.

    Always returns ``npartitions`` frames (when ``npartitions`` is
    truthy), padding with empty frames as needed.
    """
    data, index, offsets = libcudf.partitioning.partition(
        self, scatter_map, npartitions, keep_index
    )
    reordered = self.__class__._from_data(data, index)

    # Work around the _split limitation described in
    # https://github.com/rapidsai/cudf/issues/4607 by dropping the first
    # and last offsets before splitting.
    # TODO: Remove this trim once the above issue is fixed.
    parts = reordered._split(offsets[1:-1], keep_index=keep_index)

    for part in parts:
        part._copy_type_metadata(self, include_index=keep_index)

    # Pad so callers always receive npartitions frames.
    if npartitions:
        parts.extend(
            self._empty_like(keep_index)
            for _ in range(npartitions - len(parts))
        )

    return parts

def pipe(self, func, *args, **kwargs):
"""
Apply ``func(self, *args, **kwargs)``.
Expand Down Expand Up @@ -1139,9 +1086,29 @@ def scatter_by_map(
f"ERROR: map_size must be >= {count} (got {map_size})."
)

tables = self._partition(map_index, map_size, keep_index)
data, index, output_offsets = libcudf.partitioning.partition(
self, map_index, map_size, keep_index
)
partitioned = self.__class__._from_data(data, index)

return tables
# due to the split limitation mentioned
# here: https://github.com/rapidsai/cudf/issues/4607
# we need to remove first & last elements in offsets.
# TODO: Remove this after the above issue is fixed.
output_offsets = output_offsets[1:-1]

result = partitioned._split(output_offsets, keep_index=keep_index)

for frame in result:
frame._copy_type_metadata(self, include_index=keep_index)

if map_size:
result += [
self._empty_like(keep_index)
for _ in range(map_size - len(result))
]

return result

def dropna(
self, axis=0, how="any", thresh=None, subset=None, inplace=False
Expand Down Expand Up @@ -1499,8 +1466,6 @@ def _apply_boolean_mask(self, boolean_mask):
Applies boolean mask to each row of `self`,
rows corresponding to `False` is dropped
"""
boolean_mask = as_column(boolean_mask)

result = self.__class__._from_data(
*libcudf.stream_compaction.apply_boolean_mask(
self, as_column(boolean_mask)
Expand Down Expand Up @@ -2503,18 +2468,6 @@ def _copy_type_metadata(

return self

def _copy_interval_data(self, other, include_index=True):
    """Rewrap our columns as IntervalColumn wherever ``other`` has one.

    ``include_index`` is accepted for signature parity with the sibling
    metadata-copying helpers but is not used here.
    """
    interval_cls = cudf.core.column.IntervalColumn
    for (name, col), source_col in zip(
        self._data.items(), other._data.values()
    ):
        if isinstance(source_col, interval_cls):
            self._data[name] = interval_cls(col)

def _postprocess_columns(self, other, include_index=True):
    # Propagate column-level metadata from ``other`` onto this frame
    # after an operation that rebuilt its columns: categorical dtypes,
    # struct field names, then interval column typing.
    self._copy_categories(other, include_index=include_index)
    self._copy_struct_names(other, include_index=include_index)
    self._copy_interval_data(other, include_index=include_index)

def isnull(self):
"""
Identify missing values.
Expand Down
64 changes: 64 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,70 @@ def sort_index(
out = out.reset_index(drop=True)
return self._mimic_inplace(out, inplace=inplace)

def hash_values(self, method="murmur3"):
    """Compute a hash of the values in each row.

    Parameters
    ----------
    method : {'murmur3', 'md5'}, default 'murmur3'
        Hash function to use:

        * murmur3: MurmurHash3 hash function.
        * md5: MD5 hash function.

    Returns
    -------
    Series
        A Series with hash values.

    Examples
    --------
    **Series**

    >>> import cudf
    >>> series = cudf.Series([10, 120, 30])
    >>> series
    0     10
    1    120
    2     30
    dtype: int64
    >>> series.hash_values(method="murmur3")
    0    -1930516747
    1      422619251
    2     -941520876
    dtype: int32
    >>> series.hash_values(method="md5")
    0    7be4bbacbfdb05fb3044e36c22b41e8b
    1    947ca8d2c5f0f27437f156cfbfab0969
    2    d0580ef52d27c043c8e341fd5039b166
    dtype: object

    **DataFrame**

    >>> import cudf
    >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]})
    >>> df
         a     b
    0   10  0.00
    1  120  0.25
    2   30  0.50
    >>> df.hash_values(method="murmur3")
    0     -330519225
    1     -397962448
    2    -1345834934
    dtype: int32
    >>> df.hash_values(method="md5")
    0    57ce879751b5169c525907d5c563fae1
    1    948d6221a7c4963d4be411bcead7e32b
    2    fe061786ea286a515b772d91b0dfcd70
    dtype: object
    """
    hashed_column = libcudf.hash.hash(self, method)
    # Both Series and DataFrame produce a Series of per-row hashes, which
    # forces the unfortunate circular reference to the child class here.
    return cudf.Series._from_data({None: hashed_column}, index=self.index)

def _gather(
self, gather_map, keep_index=True, nullify=False, check_bounds=True
):
Expand Down
39 changes: 0 additions & 39 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3043,45 +3043,6 @@ def value_counts(
res = res / float(res._column.sum())
return res

def hash_values(self, method="murmur3"):
    """Compute a hash of the values in this column.

    Parameters
    ----------
    method : {'murmur3', 'md5'}, default 'murmur3'
        Hash function to use:

        * murmur3: MurmurHash3 hash function.
        * md5: MD5 hash function.

    Returns
    -------
    Series
        A Series with hash values.

    Examples
    --------
    >>> import cudf
    >>> series = cudf.Series([10, 120, 30])
    >>> series
    0     10
    1    120
    2     30
    dtype: int64
    >>> series.hash_values(method="murmur3")
    0    -1930516747
    1      422619251
    2     -941520876
    dtype: int32
    >>> series.hash_values(method="md5")
    0    7be4bbacbfdb05fb3044e36c22b41e8b
    1    947ca8d2c5f0f27437f156cfbfab0969
    2    d0580ef52d27c043c8e341fd5039b166
    dtype: object
    """
    # Wrap the unnamed column of per-element hashes in a Series that
    # keeps this object's index.
    hashes = self._hash(method=method)
    return Series._from_data({None: hashes}, index=self.index)

def quantile(
self, q=0.5, interpolation="linear", exact=True, quant_index=True
):
Expand Down
Loading

0 comments on commit a61fc55

Please sign in to comment.