Skip to content

Commit

Permalink
Remove various unused functions (#9922)
Browse files Browse the repository at this point in the history
This PR removes a number of unused functions and inlines some helpers that are only called in one place. It also deprecates `Series.fill`, which does not appear to be a pandas API. Resolves #9824.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Bradley Dice (https://github.com/bdice)

URL: #9922
  • Loading branch information
vyasr authored Jan 3, 2022
1 parent 67c925c commit 7233765
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 142 deletions.
143 changes: 33 additions & 110 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
_get_label_range_or_mask,
_indices_from_labels,
)
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import DataFrameResampler
from cudf.core.series import Series
from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
Expand Down Expand Up @@ -90,8 +91,6 @@

class _DataFrameIndexer(_FrameIndexer):
def __getitem__(self, arg):
from cudf import MultiIndex

if isinstance(self._frame.index, MultiIndex) or isinstance(
self._frame.columns, MultiIndex
):
Expand All @@ -118,8 +117,6 @@ def _can_downcast_to_series(self, df, arg):
operation should be "downcasted" from a DataFrame to a
Series
"""
from cudf.core.column import as_column

if isinstance(df, cudf.Series):
return False
nrows, ncols = df.shape
Expand Down Expand Up @@ -201,11 +198,6 @@ def _getitem_scalar(self, arg):
def _getitem_tuple_arg(self, arg):
from uuid import uuid4

from cudf import MultiIndex
from cudf.core.column import column
from cudf.core.dataframe import DataFrame
from cudf.core.index import as_index

# Step 1: Gather columns
if isinstance(arg, tuple):
columns_df = self._frame._get_columns_by_label(arg[1])
Expand Down Expand Up @@ -245,7 +237,7 @@ def _getitem_tuple_arg(self, arg):
tmp_arg = ([tmp_arg[0]], tmp_arg[1])
if len(tmp_arg[0]) == 0:
return columns_df._empty_like(keep_index=True)
tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])
tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1])

if is_bool_dtype(tmp_arg[0]):
df = columns_df._apply_boolean_mask(tmp_arg[0])
Expand Down Expand Up @@ -273,7 +265,7 @@ def _getitem_tuple_arg(self, arg):
start = self._frame.index[0]
df.index = as_index(start)
else:
row_selection = column.as_column(arg[0])
row_selection = as_column(arg[0])
if is_bool_dtype(row_selection.dtype):
df.index = self._frame.index.take(row_selection)
else:
Expand All @@ -285,7 +277,7 @@ def _getitem_tuple_arg(self, arg):

@annotate("LOC_SETITEM", color="blue", domain="cudf_python")
def _setitem_tuple_arg(self, key, value):
if isinstance(self._frame.index, cudf.MultiIndex) or isinstance(
if isinstance(self._frame.index, MultiIndex) or isinstance(
self._frame.columns, pd.MultiIndex
):
raise NotImplementedError(
Expand Down Expand Up @@ -322,7 +314,7 @@ def _setitem_tuple_arg(self, key, value):
self._frame._data.insert(key[1], new_col)
else:
if isinstance(value, (cupy.ndarray, np.ndarray)):
value_df = cudf.DataFrame(value)
value_df = DataFrame(value)
if value_df.shape[1] != columns_df.shape[1]:
if value_df.shape[1] == 1:
value_cols = (
Expand Down Expand Up @@ -351,13 +343,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):

@annotate("ILOC_GETITEM", color="blue", domain="cudf_python")
def _getitem_tuple_arg(self, arg):
from cudf import MultiIndex
from cudf.core.column import column
from cudf.core.index import as_index

# Iloc Step 1:
# Gather the columns specified by the second tuple arg
columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1]))
columns_df = DataFrame(self._frame._get_columns_by_index(arg[1]))

columns_df._index = self._frame._index

Expand Down Expand Up @@ -385,7 +373,7 @@ def _getitem_tuple_arg(self, arg):
index += len(columns_df)
df = columns_df._slice(slice(index, index + 1, 1))
else:
arg = (column.as_column(arg[0]), arg[1])
arg = (as_column(arg[0]), arg[1])
if is_bool_dtype(arg[0]):
df = columns_df._apply_boolean_mask(arg[0])
else:
Expand All @@ -407,7 +395,7 @@ def _getitem_tuple_arg(self, arg):

@annotate("ILOC_SETITEM", color="blue", domain="cudf_python")
def _setitem_tuple_arg(self, key, value):
columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1]))
columns = DataFrame(self._frame._get_columns_by_index(key[1]))

for col in columns:
self._frame[col].iloc[key[0]] = value
Expand Down Expand Up @@ -953,6 +941,7 @@ def ndim(self):
return 2

def __dir__(self):
# Add the columns of the DataFrame to the dir output.
o = set(dir(type(self)))
o.update(self.__dict__)
o.update(
Expand Down Expand Up @@ -1169,8 +1158,6 @@ def _slice(self: T, arg: slice) -> T:
arg : should always be of type slice
"""
from cudf.core.index import RangeIndex

num_rows = len(self)
if num_rows == 0:
return self
Expand Down Expand Up @@ -1284,8 +1271,6 @@ def memory_usage(self, index=True, deep=False):
return Series(sizes, index=ind)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
import cudf

if method == "__call__" and hasattr(cudf, ufunc.__name__):
func = getattr(cudf, ufunc.__name__)
return func(self)
Expand Down Expand Up @@ -1329,6 +1314,7 @@ def __array_function__(self, func, types, args, kwargs):
else:
return NotImplemented

# The _get_numeric_data method is necessary for dask compatibility.
def _get_numeric_data(self):
"""Return a dataframe with only numeric data types"""
columns = [
Expand Down Expand Up @@ -1554,9 +1540,9 @@ def _concat(
out._index._data,
indices[:first_data_column_position],
)
if not isinstance(
out._index, cudf.MultiIndex
) and is_categorical_dtype(out._index._values.dtype):
if not isinstance(out._index, MultiIndex) and is_categorical_dtype(
out._index._values.dtype
):
out = out.set_index(
cudf.core.index.as_index(out.index._values)
)
Expand Down Expand Up @@ -1672,51 +1658,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):

return result

def _repr_pandas025_formatting(self, ncols, nrows, dtype=None):
"""
With Pandas > 0.25 there are some new conditional formatting for some
datatypes and column/row configurations. This fixes most of them in
context to match the expected Pandas repr of the same content.
Examples
--------
>>> gdf.__repr__()
0 ... 19
0 46 ... 48
.. .. ... ..
19 40 ... 29
[20 rows x 20 columns]
>>> nrows, ncols = _repr_pandas025_formatting(2, 2, dtype="category")
>>> pd.options.display.max_rows = nrows
>>> pd.options.display.max_columns = ncols
>>> gdf.__repr__()
0 ... 19
0 46 ... 48
.. .. ... ..
19 40 ... 29
[20 rows x 20 columns]
"""
ncols = 1 if ncols in [0, 2] and dtype == "datetime64[ns]" else ncols
ncols = (
1
if ncols == 0
and nrows == 1
and dtype in ["int8", "str", "category"]
else ncols
)
ncols = (
1
if nrows == 1
and dtype in ["int8", "int16", "int64", "str", "category"]
else ncols
)
ncols = 0 if ncols == 2 else ncols
ncols = 19 if ncols in [20, 21] else ncols
return ncols, nrows

def _clean_renderable_dataframe(self, output):
"""
This method takes in partial/preprocessed dataframe
Expand Down Expand Up @@ -1822,7 +1763,7 @@ def _get_renderable_dataframe(self):
# adjust right columns for output if multiindex.
right_cols = (
right_cols - 1
if isinstance(self.index, cudf.MultiIndex)
if isinstance(self.index, MultiIndex)
else right_cols
)
left_cols = int(ncols / 2.0) + 1
Expand Down Expand Up @@ -2151,20 +2092,6 @@ def columns(self, columns):
data, multiindex=is_multiindex, level_names=columns.names,
)

def _rename_columns(self, new_names):
    """
    Rename all columns in place, pairing old and new names by position.

    Parameters
    ----------
    new_names : sequence
        Replacement column names, one per existing column, in the same
        order as ``self._data.names``.

    Raises
    ------
    ValueError
        If ``new_names`` does not have exactly one entry per existing
        column.
    """
    l_old_cols = len(self._data)
    l_new_cols = len(new_names)
    if l_new_cols != l_old_cols:
        # Both halves of the message must be f-strings; otherwise the
        # previous column count is printed as a literal placeholder.
        raise ValueError(
            f"Length of new column names: {l_new_cols} does not "
            f"match length of previous column names: {l_old_cols}"
        )

    # zip accepts any iterable, so the names can be paired directly.
    mapper = dict(zip(self._data.names, new_names))
    self.rename(mapper=mapper, inplace=True, axis=1)

def _reindex(
self, columns, dtypes=None, deep=False, index=None, inplace=False
):
Expand Down Expand Up @@ -2209,11 +2136,9 @@ def _reindex(
columns = (
columns if columns is not None else list(df._column_names)
)
df = cudf.DataFrame()
df = DataFrame()
else:
df = cudf.DataFrame(None, index).join(
df, how="left", sort=True
)
df = DataFrame(None, index).join(df, how="left", sort=True)
# double-argsort to map back from sorted to unsorted positions
df = df.take(index.argsort(ascending=True).argsort())

Expand Down Expand Up @@ -2445,7 +2370,7 @@ def set_index(
except TypeError:
msg = f"{col} cannot be converted to column-like."
raise TypeError(msg)
if isinstance(col, (cudf.MultiIndex, pd.MultiIndex)):
if isinstance(col, (MultiIndex, pd.MultiIndex)):
col = (
cudf.from_pandas(col)
if isinstance(col, pd.MultiIndex)
Expand Down Expand Up @@ -2473,7 +2398,7 @@ def set_index(

if append:
idx_cols = [self.index._data[x] for x in self.index._data]
if isinstance(self.index, cudf.MultiIndex):
if isinstance(self.index, MultiIndex):
idx_names = self.index.names
else:
idx_names = [self.index.name]
Expand All @@ -2485,7 +2410,7 @@ def set_index(
elif len(columns_to_add) == 1:
idx = cudf.Index(columns_to_add[0], name=names[0])
else:
idx = cudf.MultiIndex._from_data(
idx = MultiIndex._from_data(
{i: col for i, col in enumerate(columns_to_add)}
)
idx.names = names
Expand Down Expand Up @@ -2568,7 +2493,7 @@ class max_speed
result = self if inplace else self.copy()

if not drop:
if isinstance(self.index, cudf.MultiIndex):
if isinstance(self.index, MultiIndex):
names = tuple(
name if name is not None else f"level_{i}"
for i, name in enumerate(self.index.names)
Expand Down Expand Up @@ -3028,9 +2953,7 @@ def rename(
"mixed type is not yet supported."
)

if level is not None and isinstance(
self.index, cudf.core.multiindex.MultiIndex
):
if level is not None and isinstance(self.index, MultiIndex):
out_index = self.index.copy(deep=copy)
out_index.get_level_values(level).to_frame().replace(
to_replace=list(index.keys()),
Expand Down Expand Up @@ -3307,7 +3230,7 @@ def agg(self, aggs, axis=None):
raise NotImplementedError("axis not implemented yet")

if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)):
result = cudf.DataFrame()
result = DataFrame()
# TODO : Allow simultaneous pass for multi-aggregation as
# a future optimization
for agg in aggs:
Expand All @@ -3320,7 +3243,7 @@ def agg(self, aggs, axis=None):
f"{aggs} is not a valid function for "
f"'DataFrame' object"
)
result = cudf.DataFrame()
result = DataFrame()
result[aggs] = getattr(df_normalized, aggs)()
result = result.iloc[:, 0]
result.name = None
Expand Down Expand Up @@ -3355,7 +3278,7 @@ def agg(self, aggs, axis=None):
raise NotImplementedError(
"callable parameter is not implemented yet"
)
result = cudf.DataFrame(index=idxs, columns=cols)
result = DataFrame(index=idxs, columns=cols)
for key in aggs.keys():
col = df_normalized[key]
col_empty = column_empty(
Expand Down Expand Up @@ -4758,7 +4681,7 @@ def to_pandas(self, nullable=False, **kwargs):

if isinstance(self.columns, BaseIndex):
out_columns = self.columns.to_pandas()
if isinstance(self.columns, cudf.core.multiindex.MultiIndex):
if isinstance(self.columns, MultiIndex):
if self.columns.names is not None:
out_columns.names = self.columns.names
else:
Expand Down Expand Up @@ -4934,7 +4857,7 @@ def to_arrow(self, preserve_index=True):
"step": 1,
}
else:
if isinstance(self.index, cudf.MultiIndex):
if isinstance(self.index, MultiIndex):
gen_names = tuple(
f"level_{i}"
for i, _ in enumerate(self.index._data.names)
Expand Down Expand Up @@ -5462,7 +5385,7 @@ def _prepare_for_rowwise_op(self, method, skipna):
warnings.warn(msg)

if not skipna and any(col.nullable for col in filtered._columns):
mask = cudf.DataFrame(
mask = DataFrame(
{
name: filtered._data[name]._get_mask_as_column()
if filtered._data[name].nullable
Expand Down Expand Up @@ -6010,11 +5933,11 @@ def stack(self, level=-1, dropna=True):
repeated_index = self.index.repeat(self.shape[1])
name_index = Frame({0: self._column_names}).tile(self.shape[0])
new_index = list(repeated_index._columns) + [name_index._columns[0]]
if isinstance(self._index, cudf.MultiIndex):
if isinstance(self._index, MultiIndex):
index_names = self._index.names + [None]
else:
index_names = [None] * len(new_index)
new_index = cudf.core.multiindex.MultiIndex.from_frame(
new_index = MultiIndex.from_frame(
DataFrame(dict(zip(range(0, len(new_index)), new_index))),
names=index_names,
)
Expand Down Expand Up @@ -6275,8 +6198,8 @@ def append(
elif isinstance(other, list):
if not other:
pass
elif not isinstance(other[0], cudf.DataFrame):
other = cudf.DataFrame(other)
elif not isinstance(other[0], DataFrame):
other = DataFrame(other)
if (self.columns.get_indexer(other.columns) >= 0).all():
other = other.reindex(columns=self.columns)

Expand Down Expand Up @@ -6574,7 +6497,7 @@ def from_pandas(obj, nan_as_null=None):
elif isinstance(obj, pd.Series):
return Series.from_pandas(obj, nan_as_null=nan_as_null)
elif isinstance(obj, pd.MultiIndex):
return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
elif isinstance(obj, pd.RangeIndex):
return cudf.core.index.RangeIndex(
start=obj.start, stop=obj.stop, step=obj.step, name=obj.name
Expand Down Expand Up @@ -6692,7 +6615,7 @@ def extract_col(df, col):
if (
col == "index"
and col not in df.index._data
and not isinstance(df.index, cudf.MultiIndex)
and not isinstance(df.index, MultiIndex)
):
return df.index._data.columns[0]
return df.index._data[col]
Expand Down
Loading

0 comments on commit 7233765

Please sign in to comment.