Remove various unused functions (#9922)

This PR removes a number of unused functions and inlines some helpers that are only called in one place. This PR also deprecates `Series.fill`, which does not appear to be a pandas API. This PR resolves #9824.

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)
- Bradley Dice (https://github.com/bdice)

Approvers:
- Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
- Bradley Dice (https://github.com/bdice)

URL: #9922
rapidsai · Jan 3, 2022 · 7233765 · 7233765
1 parent 67c925c
commit 7233765
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 142 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -59,6 +59,7 @@
     _get_label_range_or_mask,
     _indices_from_labels,
 )
+from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import DataFrameResampler
 from cudf.core.series import Series
 from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
@@ -90,8 +91,6 @@
 
 class _DataFrameIndexer(_FrameIndexer):
     def __getitem__(self, arg):
-        from cudf import MultiIndex
-
         if isinstance(self._frame.index, MultiIndex) or isinstance(
             self._frame.columns, MultiIndex
         ):
@@ -118,8 +117,6 @@ def _can_downcast_to_series(self, df, arg):
         operation should be "downcasted" from a DataFrame to a
         Series
         """
-        from cudf.core.column import as_column
-
         if isinstance(df, cudf.Series):
             return False
         nrows, ncols = df.shape
@@ -201,11 +198,6 @@ def _getitem_scalar(self, arg):
     def _getitem_tuple_arg(self, arg):
         from uuid import uuid4
 
-        from cudf import MultiIndex
-        from cudf.core.column import column
-        from cudf.core.dataframe import DataFrame
-        from cudf.core.index import as_index
-
         # Step 1: Gather columns
         if isinstance(arg, tuple):
             columns_df = self._frame._get_columns_by_label(arg[1])
@@ -245,7 +237,7 @@ def _getitem_tuple_arg(self, arg):
                     tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                 if len(tmp_arg[0]) == 0:
                     return columns_df._empty_like(keep_index=True)
-                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])
+                tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1])
 
                 if is_bool_dtype(tmp_arg[0]):
                     df = columns_df._apply_boolean_mask(tmp_arg[0])
@@ -273,7 +265,7 @@ def _getitem_tuple_arg(self, arg):
                     start = self._frame.index[0]
                 df.index = as_index(start)
             else:
-                row_selection = column.as_column(arg[0])
+                row_selection = as_column(arg[0])
                 if is_bool_dtype(row_selection.dtype):
                     df.index = self._frame.index.take(row_selection)
                 else:
@@ -285,7 +277,7 @@ def _getitem_tuple_arg(self, arg):
 
     @annotate("LOC_SETITEM", color="blue", domain="cudf_python")
     def _setitem_tuple_arg(self, key, value):
-        if isinstance(self._frame.index, cudf.MultiIndex) or isinstance(
+        if isinstance(self._frame.index, MultiIndex) or isinstance(
             self._frame.columns, pd.MultiIndex
         ):
             raise NotImplementedError(
@@ -322,7 +314,7 @@ def _setitem_tuple_arg(self, key, value):
             self._frame._data.insert(key[1], new_col)
         else:
             if isinstance(value, (cupy.ndarray, np.ndarray)):
-                value_df = cudf.DataFrame(value)
+                value_df = DataFrame(value)
                 if value_df.shape[1] != columns_df.shape[1]:
                     if value_df.shape[1] == 1:
                         value_cols = (
@@ -351,13 +343,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
 
     @annotate("ILOC_GETITEM", color="blue", domain="cudf_python")
     def _getitem_tuple_arg(self, arg):
-        from cudf import MultiIndex
-        from cudf.core.column import column
-        from cudf.core.index import as_index
-
         # Iloc Step 1:
         # Gather the columns specified by the second tuple arg
-        columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1]))
+        columns_df = DataFrame(self._frame._get_columns_by_index(arg[1]))
 
         columns_df._index = self._frame._index
 
@@ -385,7 +373,7 @@ def _getitem_tuple_arg(self, arg):
                     index += len(columns_df)
                 df = columns_df._slice(slice(index, index + 1, 1))
             else:
-                arg = (column.as_column(arg[0]), arg[1])
+                arg = (as_column(arg[0]), arg[1])
                 if is_bool_dtype(arg[0]):
                     df = columns_df._apply_boolean_mask(arg[0])
                 else:
@@ -407,7 +395,7 @@ def _getitem_tuple_arg(self, arg):
 
     @annotate("ILOC_SETITEM", color="blue", domain="cudf_python")
     def _setitem_tuple_arg(self, key, value):
-        columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1]))
+        columns = DataFrame(self._frame._get_columns_by_index(key[1]))
 
         for col in columns:
             self._frame[col].iloc[key[0]] = value
@@ -953,6 +941,7 @@ def ndim(self):
         return 2
 
     def __dir__(self):
+        # Add the columns of the DataFrame to the dir output.
         o = set(dir(type(self)))
         o.update(self.__dict__)
         o.update(
@@ -1169,8 +1158,6 @@ def _slice(self: T, arg: slice) -> T:
         arg : should always be of type slice
 
         """
-        from cudf.core.index import RangeIndex
-
         num_rows = len(self)
         if num_rows == 0:
             return self
@@ -1284,8 +1271,6 @@ def memory_usage(self, index=True, deep=False):
         return Series(sizes, index=ind)
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-        import cudf
-
         if method == "__call__" and hasattr(cudf, ufunc.__name__):
             func = getattr(cudf, ufunc.__name__)
             return func(self)
@@ -1329,6 +1314,7 @@ def __array_function__(self, func, types, args, kwargs):
         else:
             return NotImplemented
 
+    # The _get_numeric_data method is necessary for dask compatibility.
     def _get_numeric_data(self):
         """Return a dataframe with only numeric data types"""
         columns = [
@@ -1554,9 +1540,9 @@ def _concat(
                 out._index._data,
                 indices[:first_data_column_position],
             )
-            if not isinstance(
-                out._index, cudf.MultiIndex
-            ) and is_categorical_dtype(out._index._values.dtype):
+            if not isinstance(out._index, MultiIndex) and is_categorical_dtype(
+                out._index._values.dtype
+            ):
                 out = out.set_index(
                     cudf.core.index.as_index(out.index._values)
                 )
@@ -1672,51 +1658,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
 
         return result
 
-    def _repr_pandas025_formatting(self, ncols, nrows, dtype=None):
-        """
-        With Pandas > 0.25 there are some new conditional formatting for some
-        datatypes and column/row configurations. This fixes most of them in
-        context to match the expected Pandas repr of the same content.
-
-        Examples
-        --------
-        >>> gdf.__repr__()
-            0   ...  19
-        0   46  ...  48
-        ..  ..  ...  ..
-        19  40  ...  29
-
-        [20 rows x 20 columns]
-
-        >>> nrows, ncols = _repr_pandas025_formatting(2, 2, dtype="category")
-        >>> pd.options.display.max_rows = nrows
-        >>> pd.options.display.max_columns = ncols
-        >>> gdf.__repr__()
-             0  ...  19
-        0   46  ...  48
-        ..  ..  ...  ..
-        19  40  ...  29
-
-        [20 rows x 20 columns]
-        """
-        ncols = 1 if ncols in [0, 2] and dtype == "datetime64[ns]" else ncols
-        ncols = (
-            1
-            if ncols == 0
-            and nrows == 1
-            and dtype in ["int8", "str", "category"]
-            else ncols
-        )
-        ncols = (
-            1
-            if nrows == 1
-            and dtype in ["int8", "int16", "int64", "str", "category"]
-            else ncols
-        )
-        ncols = 0 if ncols == 2 else ncols
-        ncols = 19 if ncols in [20, 21] else ncols
-        return ncols, nrows
-
     def _clean_renderable_dataframe(self, output):
         """
         This method takes in partial/preprocessed dataframe
@@ -1822,7 +1763,7 @@ def _get_renderable_dataframe(self):
                 # adjust right columns for output if multiindex.
                 right_cols = (
                     right_cols - 1
-                    if isinstance(self.index, cudf.MultiIndex)
+                    if isinstance(self.index, MultiIndex)
                     else right_cols
                 )
                 left_cols = int(ncols / 2.0) + 1
@@ -2151,20 +2092,6 @@ def columns(self, columns):
             data, multiindex=is_multiindex, level_names=columns.names,
         )
 
-    def _rename_columns(self, new_names):
-        old_cols = iter(self._data.names)
-        l_old_cols = len(self._data)
-        l_new_cols = len(new_names)
-        if l_new_cols != l_old_cols:
-            msg = (
-                f"Length of new column names: {l_new_cols} does not "
-                "match length of previous column names: {l_old_cols}"
-            )
-            raise ValueError(msg)
-
-        mapper = dict(zip(old_cols, new_names))
-        self.rename(mapper=mapper, inplace=True, axis=1)
-
     def _reindex(
         self, columns, dtypes=None, deep=False, index=None, inplace=False
     ):
@@ -2209,11 +2136,9 @@ def _reindex(
                 columns = (
                     columns if columns is not None else list(df._column_names)
                 )
-                df = cudf.DataFrame()
+                df = DataFrame()
             else:
-                df = cudf.DataFrame(None, index).join(
-                    df, how="left", sort=True
-                )
+                df = DataFrame(None, index).join(df, how="left", sort=True)
                 # double-argsort to map back from sorted to unsorted positions
                 df = df.take(index.argsort(ascending=True).argsort())
 
@@ -2445,7 +2370,7 @@ def set_index(
                     except TypeError:
                         msg = f"{col} cannot be converted to column-like."
                         raise TypeError(msg)
-                if isinstance(col, (cudf.MultiIndex, pd.MultiIndex)):
+                if isinstance(col, (MultiIndex, pd.MultiIndex)):
                     col = (
                         cudf.from_pandas(col)
                         if isinstance(col, pd.MultiIndex)
@@ -2473,7 +2398,7 @@ def set_index(
 
         if append:
             idx_cols = [self.index._data[x] for x in self.index._data]
-            if isinstance(self.index, cudf.MultiIndex):
+            if isinstance(self.index, MultiIndex):
                 idx_names = self.index.names
             else:
                 idx_names = [self.index.name]
@@ -2485,7 +2410,7 @@ def set_index(
         elif len(columns_to_add) == 1:
             idx = cudf.Index(columns_to_add[0], name=names[0])
         else:
-            idx = cudf.MultiIndex._from_data(
+            idx = MultiIndex._from_data(
                 {i: col for i, col in enumerate(columns_to_add)}
             )
             idx.names = names
@@ -2568,7 +2493,7 @@ class max_speed
         result = self if inplace else self.copy()
 
         if not drop:
-            if isinstance(self.index, cudf.MultiIndex):
+            if isinstance(self.index, MultiIndex):
                 names = tuple(
                     name if name is not None else f"level_{i}"
                     for i, name in enumerate(self.index.names)
@@ -3028,9 +2953,7 @@ def rename(
                     "mixed type is not yet supported."
                 )
 
-            if level is not None and isinstance(
-                self.index, cudf.core.multiindex.MultiIndex
-            ):
+            if level is not None and isinstance(self.index, MultiIndex):
                 out_index = self.index.copy(deep=copy)
                 out_index.get_level_values(level).to_frame().replace(
                     to_replace=list(index.keys()),
@@ -3307,7 +3230,7 @@ def agg(self, aggs, axis=None):
             raise NotImplementedError("axis not implemented yet")
 
         if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)):
-            result = cudf.DataFrame()
+            result = DataFrame()
             # TODO : Allow simultaneous pass for multi-aggregation as
             # a future optimization
             for agg in aggs:
@@ -3320,7 +3243,7 @@ def agg(self, aggs, axis=None):
                     f"{aggs} is not a valid function for "
                     f"'DataFrame' object"
                 )
-            result = cudf.DataFrame()
+            result = DataFrame()
             result[aggs] = getattr(df_normalized, aggs)()
             result = result.iloc[:, 0]
             result.name = None
@@ -3355,7 +3278,7 @@ def agg(self, aggs, axis=None):
                         raise NotImplementedError(
                             "callable parameter is not implemented yet"
                         )
-                result = cudf.DataFrame(index=idxs, columns=cols)
+                result = DataFrame(index=idxs, columns=cols)
                 for key in aggs.keys():
                     col = df_normalized[key]
                     col_empty = column_empty(
@@ -4758,7 +4681,7 @@ def to_pandas(self, nullable=False, **kwargs):
 
         if isinstance(self.columns, BaseIndex):
             out_columns = self.columns.to_pandas()
-            if isinstance(self.columns, cudf.core.multiindex.MultiIndex):
+            if isinstance(self.columns, MultiIndex):
                 if self.columns.names is not None:
                     out_columns.names = self.columns.names
             else:
@@ -4934,7 +4857,7 @@ def to_arrow(self, preserve_index=True):
                     "step": 1,
                 }
             else:
-                if isinstance(self.index, cudf.MultiIndex):
+                if isinstance(self.index, MultiIndex):
                     gen_names = tuple(
                         f"level_{i}"
                         for i, _ in enumerate(self.index._data.names)
@@ -5462,7 +5385,7 @@ def _prepare_for_rowwise_op(self, method, skipna):
             warnings.warn(msg)
 
         if not skipna and any(col.nullable for col in filtered._columns):
-            mask = cudf.DataFrame(
+            mask = DataFrame(
                 {
                     name: filtered._data[name]._get_mask_as_column()
                     if filtered._data[name].nullable
@@ -6010,11 +5933,11 @@ def stack(self, level=-1, dropna=True):
         repeated_index = self.index.repeat(self.shape[1])
         name_index = Frame({0: self._column_names}).tile(self.shape[0])
         new_index = list(repeated_index._columns) + [name_index._columns[0]]
-        if isinstance(self._index, cudf.MultiIndex):
+        if isinstance(self._index, MultiIndex):
             index_names = self._index.names + [None]
         else:
             index_names = [None] * len(new_index)
-        new_index = cudf.core.multiindex.MultiIndex.from_frame(
+        new_index = MultiIndex.from_frame(
             DataFrame(dict(zip(range(0, len(new_index)), new_index))),
             names=index_names,
         )
@@ -6275,8 +6198,8 @@ def append(
         elif isinstance(other, list):
             if not other:
                 pass
-            elif not isinstance(other[0], cudf.DataFrame):
-                other = cudf.DataFrame(other)
+            elif not isinstance(other[0], DataFrame):
+                other = DataFrame(other)
                 if (self.columns.get_indexer(other.columns) >= 0).all():
                     other = other.reindex(columns=self.columns)
 
@@ -6574,7 +6497,7 @@ def from_pandas(obj, nan_as_null=None):
     elif isinstance(obj, pd.Series):
         return Series.from_pandas(obj, nan_as_null=nan_as_null)
     elif isinstance(obj, pd.MultiIndex):
-        return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
+        return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
     elif isinstance(obj, pd.RangeIndex):
         return cudf.core.index.RangeIndex(
             start=obj.start, stop=obj.stop, step=obj.step, name=obj.name
@@ -6692,7 +6615,7 @@ def extract_col(df, col):
         if (
             col == "index"
             and col not in df.index._data
-            and not isinstance(df.index, cudf.MultiIndex)
+            and not isinstance(df.index, MultiIndex)
         ):
             return df.index._data.columns[0]
         return df.index._data[col]