From 723376576b2f36711b12dd434b95e8aeac99f653 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jan 2022 12:38:41 -0600 Subject: [PATCH] Remove various unused functions (#9922) This PR removes a number of unused functions and inlines some helpers that are only called in one place. This PR also deprecates `Series.fill`, which does not appear to be a pandas API. This PR resolves #9824. Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9922 --- python/cudf/cudf/core/dataframe.py | 143 +++++++--------------------- python/cudf/cudf/core/frame.py | 35 +++---- python/cudf/cudf/core/series.py | 18 +++- python/cudf/cudf/tests/test_fill.py | 2 +- python/cudf/cudf/tests/test_repr.py | 6 -- 5 files changed, 62 insertions(+), 142 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 88c8aaebd9e..b7fc5efb412 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -59,6 +59,7 @@ _get_label_range_or_mask, _indices_from_labels, ) +from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.utils import applyutils, docutils, ioutils, queryutils, utils @@ -90,8 +91,6 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - from cudf import MultiIndex - if isinstance(self._frame.index, MultiIndex) or isinstance( self._frame.columns, MultiIndex ): @@ -118,8 +117,6 @@ def _can_downcast_to_series(self, df, arg): operation should be "downcasted" from a DataFrame to a Series """ - from cudf.core.column import as_column - if isinstance(df, cudf.Series): return False nrows, ncols = df.shape @@ -201,11 +198,6 @@ def _getitem_scalar(self, arg): def _getitem_tuple_arg(self, arg): from uuid import uuid4 - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.dataframe import DataFrame - from cudf.core.index import as_index - # Step 1: Gather columns if isinstance(arg, tuple): columns_df = self._frame._get_columns_by_label(arg[1]) @@ -245,7 +237,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg = ([tmp_arg[0]], tmp_arg[1]) if len(tmp_arg[0]) == 0: return columns_df._empty_like(keep_index=True) - tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1]) + tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1]) if is_bool_dtype(tmp_arg[0]): df = columns_df._apply_boolean_mask(tmp_arg[0]) @@ -273,7 +265,7 @@ def _getitem_tuple_arg(self, arg): start = self._frame.index[0] df.index = as_index(start) else: - row_selection = column.as_column(arg[0]) + row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): df.index = self._frame.index.take(row_selection) else: @@ -285,7 +277,7 @@ def _getitem_tuple_arg(self, arg): @annotate("LOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( + if isinstance(self._frame.index, MultiIndex) or isinstance( self._frame.columns, pd.MultiIndex ): raise NotImplementedError( @@ -322,7 +314,7 @@ def _setitem_tuple_arg(self, key, value): self._frame._data.insert(key[1], new_col) else: if isinstance(value, (cupy.ndarray, np.ndarray)): - value_df = cudf.DataFrame(value) + value_df = DataFrame(value) if value_df.shape[1] != columns_df.shape[1]: if value_df.shape[1] == 1: value_cols = ( @@ -351,13 +343,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") def _getitem_tuple_arg(self, arg): - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.index import as_index - # Iloc Step 1: # Gather the columns specified by the second tuple arg - columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1])) + columns_df = DataFrame(self._frame._get_columns_by_index(arg[1])) columns_df._index = self._frame._index @@ -385,7 +373,7 @@ def _getitem_tuple_arg(self, arg): index += len(columns_df) df = columns_df._slice(slice(index, index + 1, 1)) else: - arg = (column.as_column(arg[0]), arg[1]) + arg = (as_column(arg[0]), arg[1]) if is_bool_dtype(arg[0]): df = columns_df._apply_boolean_mask(arg[0]) else: @@ -407,7 +395,7 @@ def _getitem_tuple_arg(self, arg): @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1])) + columns = DataFrame(self._frame._get_columns_by_index(key[1])) for col in columns: self._frame[col].iloc[key[0]] = value @@ -953,6 +941,7 @@ def ndim(self): return 2 def __dir__(self): + # Add the columns of the DataFrame to the dir output. o = set(dir(type(self))) o.update(self.__dict__) o.update( @@ -1169,8 +1158,6 @@ def _slice(self: T, arg: slice) -> T: arg : should always be of type slice """ - from cudf.core.index import RangeIndex - num_rows = len(self) if num_rows == 0: return self @@ -1284,8 +1271,6 @@ def memory_usage(self, index=True, deep=False): return Series(sizes, index=ind) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - import cudf - if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) return func(self) @@ -1329,6 +1314,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented + # The _get_numeric_data method is necessary for dask compatibility. def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1554,9 +1540,9 @@ def _concat( out._index._data, indices[:first_data_column_position], ) - if not isinstance( - out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): + if not isinstance(out._index, MultiIndex) and is_categorical_dtype( + out._index._values.dtype + ): out = out.set_index( cudf.core.index.as_index(out.index._values) ) @@ -1672,51 +1658,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): return result - def _repr_pandas025_formatting(self, ncols, nrows, dtype=None): - """ - With Pandas > 0.25 there are some new conditional formatting for some - datatypes and column/row configurations. This fixes most of them in - context to match the expected Pandas repr of the same content. - - Examples - -------- - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 29 - - [20 rows x 20 columns] - - >>> nrows, ncols = _repr_pandas025_formatting(2, 2, dtype="category") - >>> pd.options.display.max_rows = nrows - >>> pd.options.display.max_columns = ncols - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 29 - - [20 rows x 20 columns] - """ - ncols = 1 if ncols in [0, 2] and dtype == "datetime64[ns]" else ncols - ncols = ( - 1 - if ncols == 0 - and nrows == 1 - and dtype in ["int8", "str", "category"] - else ncols - ) - ncols = ( - 1 - if nrows == 1 - and dtype in ["int8", "int16", "int64", "str", "category"] - else ncols - ) - ncols = 0 if ncols == 2 else ncols - ncols = 19 if ncols in [20, 21] else ncols - return ncols, nrows - def _clean_renderable_dataframe(self, output): """ This method takes in partial/preprocessed dataframe @@ -1822,7 +1763,7 @@ def _get_renderable_dataframe(self): # adjust right columns for output if multiindex. right_cols = ( right_cols - 1 - if isinstance(self.index, cudf.MultiIndex) + if isinstance(self.index, MultiIndex) else right_cols ) left_cols = int(ncols / 2.0) + 1 @@ -2151,20 +2092,6 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) - def _rename_columns(self, new_names): - old_cols = iter(self._data.names) - l_old_cols = len(self._data) - l_new_cols = len(new_names) - if l_new_cols != l_old_cols: - msg = ( - f"Length of new column names: {l_new_cols} does not " - "match length of previous column names: {l_old_cols}" - ) - raise ValueError(msg) - - mapper = dict(zip(old_cols, new_names)) - self.rename(mapper=mapper, inplace=True, axis=1) - def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2209,11 +2136,9 @@ def _reindex( columns = ( columns if columns is not None else list(df._column_names) ) - df = cudf.DataFrame() + df = DataFrame() else: - df = cudf.DataFrame(None, index).join( - df, how="left", sort=True - ) + df = DataFrame(None, index).join(df, how="left", sort=True) # double-argsort to map back from sorted to unsorted positions df = df.take(index.argsort(ascending=True).argsort()) @@ -2445,7 +2370,7 @@ def set_index( except TypeError: msg = f"{col} cannot be converted to column-like." raise TypeError(msg) - if isinstance(col, (cudf.MultiIndex, pd.MultiIndex)): + if isinstance(col, (MultiIndex, pd.MultiIndex)): col = ( cudf.from_pandas(col) if isinstance(col, pd.MultiIndex) @@ -2473,7 +2398,7 @@ def set_index( if append: idx_cols = [self.index._data[x] for x in self.index._data] - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): idx_names = self.index.names else: idx_names = [self.index.name] @@ -2485,7 +2410,7 @@ def set_index( elif len(columns_to_add) == 1: idx = cudf.Index(columns_to_add[0], name=names[0]) else: - idx = cudf.MultiIndex._from_data( + idx = MultiIndex._from_data( {i: col for i, col in enumerate(columns_to_add)} ) idx.names = names @@ -2568,7 +2493,7 @@ class max_speed result = self if inplace else self.copy() if not drop: - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): names = tuple( name if name is not None else f"level_{i}" for i, name in enumerate(self.index.names) @@ -3028,9 +2953,7 @@ def rename( "mixed type is not yet supported." ) - if level is not None and isinstance( - self.index, cudf.core.multiindex.MultiIndex - ): + if level is not None and isinstance(self.index, MultiIndex): out_index = self.index.copy(deep=copy) out_index.get_level_values(level).to_frame().replace( to_replace=list(index.keys()), @@ -3307,7 +3230,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError("axis not implemented yet") if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): - result = cudf.DataFrame() + result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization for agg in aggs: @@ -3320,7 +3243,7 @@ def agg(self, aggs, axis=None): f"{aggs} is not a valid function for " f"'DataFrame' object" ) - result = cudf.DataFrame() + result = DataFrame() result[aggs] = getattr(df_normalized, aggs)() result = result.iloc[:, 0] result.name = None @@ -3355,7 +3278,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError( "callable parameter is not implemented yet" ) - result = cudf.DataFrame(index=idxs, columns=cols) + result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = df_normalized[key] col_empty = column_empty( @@ -4758,7 +4681,7 @@ def to_pandas(self, nullable=False, **kwargs): if isinstance(self.columns, BaseIndex): out_columns = self.columns.to_pandas() - if isinstance(self.columns, cudf.core.multiindex.MultiIndex): + if isinstance(self.columns, MultiIndex): if self.columns.names is not None: out_columns.names = self.columns.names else: @@ -4934,7 +4857,7 @@ def to_arrow(self, preserve_index=True): "step": 1, } else: - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): gen_names = tuple( f"level_{i}" for i, _ in enumerate(self.index._data.names) @@ -5462,7 +5385,7 @@ def _prepare_for_rowwise_op(self, method, skipna): warnings.warn(msg) if not skipna and any(col.nullable for col in filtered._columns): - mask = cudf.DataFrame( + mask = DataFrame( { name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable @@ -6010,11 +5933,11 @@ def stack(self, level=-1, dropna=True): repeated_index = self.index.repeat(self.shape[1]) name_index = Frame({0: self._column_names}).tile(self.shape[0]) new_index = list(repeated_index._columns) + [name_index._columns[0]] - if isinstance(self._index, cudf.MultiIndex): + if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: index_names = [None] * len(new_index) - new_index = cudf.core.multiindex.MultiIndex.from_frame( + new_index = MultiIndex.from_frame( DataFrame(dict(zip(range(0, len(new_index)), new_index))), names=index_names, ) @@ -6275,8 +6198,8 @@ def append( elif isinstance(other, list): if not other: pass - elif not isinstance(other[0], cudf.DataFrame): - other = cudf.DataFrame(other) + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.reindex(columns=self.columns) @@ -6574,7 +6497,7 @@ def from_pandas(obj, nan_as_null=None): elif isinstance(obj, pd.Series): return Series.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) + return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.RangeIndex): return cudf.core.index.RangeIndex( start=obj.start, stop=obj.stop, step=obj.step, name=obj.name @@ -6692,7 +6615,7 @@ def extract_col(df, col): if ( col == "index" and col not in df.index._data - and not isinstance(df.index, cudf.MultiIndex) + and not isinstance(df.index, MultiIndex) ): return df.index._data.columns[0] return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c83b06707a4..bae15c5e9fd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1798,40 +1798,27 @@ def repeat(self, repeats, axis=None): "Only axis=`None` supported at this time." ) - return self._repeat(repeats) - - def _repeat(self, count): - if not is_scalar(count): - count = as_column(count) + if not is_scalar(repeats): + repeats = as_column(repeats) result = self.__class__._from_data( - *libcudf.filling.repeat(self, count) + *libcudf.filling.repeat(self, repeats) ) result._copy_type_metadata(self) return result - def _fill(self, fill_values, begin, end, inplace): - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" - assert axis in (None, 0) and freq is None - return self._shift(periods) + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: + raise ValueError("Only axis=0 is supported.") + if freq is not None: + raise ValueError("The freq argument is not yet supported.") - def _shift(self, offset, fill_value=None): - data_columns = (col.shift(offset, fill_value) for col in self._columns) + data_columns = ( + col.shift(periods, fill_value) for col in self._columns + ) return self.__class__._from_data( zip(self._column_names, data_columns), self._index ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ec7c3df076..fb86cf85c4c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1628,7 +1628,23 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) def fill(self, fill_value, begin=0, end=-1, inplace=False): - return self._fill([fill_value], begin, end, inplace) + warnings.warn( + "The fill method will be removed in a future cuDF release.", + FutureWarning, + ) + fill_values = [fill_value] + col_and_fill = zip(self._columns, fill_values) + + if not inplace: + data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) + + for (c, v) in col_and_fill: + c.fill(v, begin, end, inplace=True) + + return self def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index efbe2834486..224db2b39d1 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -50,7 +50,7 @@ def test_fill(data, fill_value, begin, end, inplace): begin = max(0, min(len(gs), begin)) end = max(0, min(len(gs), end)) - actual = gs._fill([fill_value], begin, end, False) + actual = gs.fill(fill_value, begin, end, False) assert actual is not gs ps[begin:end] = fill_value diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index fe95b2930df..f8c136b8c2d 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -98,15 +98,9 @@ def test_full_dataframe_20(dtype, nrows, ncols): ).astype(dtype) gdf = cudf.from_pandas(pdf) - ncols, nrows = gdf._repr_pandas025_formatting(ncols, nrows, dtype) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() assert pdf._repr_html_() == gdf._repr_html_() assert pdf._repr_latex_() == gdf._repr_latex_() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") @pytest.mark.parametrize("dtype", repr_categories)