From 213446501a204f629ea27321222cb44077f5a9a9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 14:30:37 -0800 Subject: [PATCH 01/11] Inline _repeat and _shift. --- python/cudf/cudf/core/frame.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c85ed0c8555..b9dd4d192a9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1798,14 +1798,11 @@ def repeat(self, repeats, axis=None): "Only axis=`None` supported at this time." ) - return self._repeat(repeats) - - def _repeat(self, count): - if not is_scalar(count): - count = as_column(count) + if not is_scalar(repeats): + repeats = as_column(repeats) result = self.__class__._from_data( - *libcudf.filling.repeat(self, count) + *libcudf.filling.repeat(self, repeats) ) result._copy_type_metadata(self) @@ -1827,11 +1824,15 @@ def _fill(self, fill_values, begin, end, inplace): def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" - assert axis in (None, 0) and freq is None - return self._shift(periods) + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: + raise ValueError("Only axis=0 is supported.") + if freq is not None: + raise ValueError("The freq argument is not yet supported.") - def _shift(self, offset, fill_value=None): - data_columns = (col.shift(offset, fill_value) for col in self._columns) + data_columns = ( + col.shift(periods, fill_value) for col in self._columns + ) return self.__class__._from_data( zip(self._column_names, data_columns), self._index ) From 450353416ccb5eabbeb9e43de3d5a7a993bca53a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 14:36:33 -0800 Subject: [PATCH 02/11] Inline _fill and deprecate fill. 
--- python/cudf/cudf/core/frame.py | 14 -------------- python/cudf/cudf/core/series.py | 18 +++++++++++++++++- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b9dd4d192a9..c4ee00c60b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1808,20 +1808,6 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result - def _fill(self, fill_values, begin, end, inplace): - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 036c8c1ee00..8bc757f33db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1629,7 +1629,23 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) def fill(self, fill_value, begin=0, end=-1, inplace=False): - return self._fill([fill_value], begin, end, inplace) + warnings.warn( + "The fill method will be removed in a future cuDF " "release.", + FutureWarning, + ) + fill_values = [fill_value] + col_and_fill = zip(self._columns, fill_values) + + if not inplace: + data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) + + for (c, v) in col_and_fill: + c.fill(v, begin, end, inplace=True) + + return self def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None From 
7f734973fc4014f8dcf5542c33dbeb4e45db8db2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 14:48:20 -0800 Subject: [PATCH 03/11] Inline _split. --- python/cudf/cudf/core/frame.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c4ee00c60b7..7eb7aa0c3cd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1024,6 +1024,11 @@ def _partition(self, scatter_map, npartitions, keep_index=True): result = partitioned._split(output_offsets, keep_index=keep_index) + splits = libcudf.copying.table_split( + self, output_offsets, keep_index=keep_index + ) + result = [self.__class__._from_data(*result) for result in splits] + for frame in result: frame._copy_type_metadata(self, include_index=keep_index) @@ -3733,12 +3738,6 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) - def _split(self, splits, keep_index=True): - results = libcudf.copying.table_split( - self, splits, keep_index=keep_index - ) - return [self.__class__._from_data(*result) for result in results] - def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): From 4185253606e7732253258ff0b0beb63f8607b4f7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 09:29:06 -0800 Subject: [PATCH 04/11] Remove _repr_pandas025_formatting. 
--- python/cudf/cudf/core/dataframe.py | 45 ----------------------------- python/cudf/cudf/tests/test_repr.py | 6 ---- 2 files changed, 51 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 88c8aaebd9e..16ccbfdeabf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1672,51 +1672,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): return result - def _repr_pandas025_formatting(self, ncols, nrows, dtype=None): - """ - With Pandas > 0.25 there are some new conditional formatting for some - datatypes and column/row configurations. This fixes most of them in - context to match the expected Pandas repr of the same content. - - Examples - -------- - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 29 - - [20 rows x 20 columns] - - >>> nrows, ncols = _repr_pandas025_formatting(2, 2, dtype="category") - >>> pd.options.display.max_rows = nrows - >>> pd.options.display.max_columns = ncols - >>> gdf.__repr__() - 0 ... 19 - 0 46 ... 48 - .. .. ... .. - 19 40 ... 
29 - - [20 rows x 20 columns] - """ - ncols = 1 if ncols in [0, 2] and dtype == "datetime64[ns]" else ncols - ncols = ( - 1 - if ncols == 0 - and nrows == 1 - and dtype in ["int8", "str", "category"] - else ncols - ) - ncols = ( - 1 - if nrows == 1 - and dtype in ["int8", "int16", "int64", "str", "category"] - else ncols - ) - ncols = 0 if ncols == 2 else ncols - ncols = 19 if ncols in [20, 21] else ncols - return ncols, nrows - def _clean_renderable_dataframe(self, output): """ This method takes in partial/preprocessed dataframe diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index fe95b2930df..f8c136b8c2d 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -98,15 +98,9 @@ def test_full_dataframe_20(dtype, nrows, ncols): ).astype(dtype) gdf = cudf.from_pandas(pdf) - ncols, nrows = gdf._repr_pandas025_formatting(ncols, nrows, dtype) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() assert pdf._repr_html_() == gdf._repr_html_() assert pdf._repr_latex_() == gdf._repr_latex_() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") @pytest.mark.parametrize("dtype", repr_categories) From 74d4a1079140400a94886d4145853229722a7d5a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 09:45:22 -0800 Subject: [PATCH 05/11] Remove unnecessary local imports. 
--- python/cudf/cudf/core/dataframe.py | 65 +++++++++++------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16ccbfdeabf..4696804fe01 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -59,6 +59,7 @@ _get_label_range_or_mask, _indices_from_labels, ) +from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.utils import applyutils, docutils, ioutils, queryutils, utils @@ -90,10 +91,8 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - from cudf import MultiIndex - - if isinstance(self._frame.index, MultiIndex) or isinstance( - self._frame.columns, MultiIndex + if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( + self._frame.columns, cudf.MultiIndex ): # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. 
@@ -118,8 +117,6 @@ def _can_downcast_to_series(self, df, arg): operation should be "downcasted" from a DataFrame to a Series """ - from cudf.core.column import as_column - if isinstance(df, cudf.Series): return False nrows, ncols = df.shape @@ -201,11 +198,6 @@ def _getitem_scalar(self, arg): def _getitem_tuple_arg(self, arg): from uuid import uuid4 - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.dataframe import DataFrame - from cudf.core.index import as_index - # Step 1: Gather columns if isinstance(arg, tuple): columns_df = self._frame._get_columns_by_label(arg[1]) @@ -245,13 +237,13 @@ def _getitem_tuple_arg(self, arg): tmp_arg = ([tmp_arg[0]], tmp_arg[1]) if len(tmp_arg[0]) == 0: return columns_df._empty_like(keep_index=True) - tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1]) + tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1]) if is_bool_dtype(tmp_arg[0]): df = columns_df._apply_boolean_mask(tmp_arg[0]) else: tmp_col_name = str(uuid4()) - other_df = DataFrame( + other_df = cudf.DataFrame( {tmp_col_name: column.arange(len(tmp_arg[0]))}, index=as_index(tmp_arg[0]), ) @@ -273,7 +265,7 @@ def _getitem_tuple_arg(self, arg): start = self._frame.index[0] df.index = as_index(start) else: - row_selection = column.as_column(arg[0]) + row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): df.index = self._frame.index.take(row_selection) else: @@ -285,7 +277,7 @@ def _getitem_tuple_arg(self, arg): @annotate("LOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( + if isinstance(self._frame.index, MultiIndex) or isinstance( self._frame.columns, pd.MultiIndex ): raise NotImplementedError( @@ -351,10 +343,6 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") def _getitem_tuple_arg(self, arg): - from cudf import MultiIndex - from cudf.core.column import 
column - from cudf.core.index import as_index - # Iloc Step 1: # Gather the columns specified by the second tuple arg columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1])) @@ -385,7 +373,7 @@ def _getitem_tuple_arg(self, arg): index += len(columns_df) df = columns_df._slice(slice(index, index + 1, 1)) else: - arg = (column.as_column(arg[0]), arg[1]) + arg = (as_column(arg[0]), arg[1]) if is_bool_dtype(arg[0]): df = columns_df._apply_boolean_mask(arg[0]) else: @@ -953,6 +941,7 @@ def ndim(self): return 2 def __dir__(self): + # Add the columns of the DataFrame to the dir output. o = set(dir(type(self))) o.update(self.__dict__) o.update( @@ -1169,8 +1158,6 @@ def _slice(self: T, arg: slice) -> T: arg : should always be of type slice """ - from cudf.core.index import RangeIndex - num_rows = len(self) if num_rows == 0: return self @@ -1284,8 +1271,6 @@ def memory_usage(self, index=True, deep=False): return Series(sizes, index=ind) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - import cudf - if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) return func(self) @@ -1554,9 +1539,9 @@ def _concat( out._index._data, indices[:first_data_column_position], ) - if not isinstance( - out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): + if not isinstance(out._index, MultiIndex) and is_categorical_dtype( + out._index._values.dtype + ): out = out.set_index( cudf.core.index.as_index(out.index._values) ) @@ -1777,7 +1762,7 @@ def _get_renderable_dataframe(self): # adjust right columns for output if multiindex. right_cols = ( right_cols - 1 - if isinstance(self.index, cudf.MultiIndex) + if isinstance(self.index, MultiIndex) else right_cols ) left_cols = int(ncols / 2.0) + 1 @@ -2400,7 +2385,7 @@ def set_index( except TypeError: msg = f"{col} cannot be converted to column-like." 
raise TypeError(msg) - if isinstance(col, (cudf.MultiIndex, pd.MultiIndex)): + if isinstance(col, (MultiIndex, pd.MultiIndex)): col = ( cudf.from_pandas(col) if isinstance(col, pd.MultiIndex) @@ -2428,7 +2413,7 @@ def set_index( if append: idx_cols = [self.index._data[x] for x in self.index._data] - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): idx_names = self.index.names else: idx_names = [self.index.name] @@ -2440,7 +2425,7 @@ def set_index( elif len(columns_to_add) == 1: idx = cudf.Index(columns_to_add[0], name=names[0]) else: - idx = cudf.MultiIndex._from_data( + idx = MultiIndex._from_data( {i: col for i, col in enumerate(columns_to_add)} ) idx.names = names @@ -2523,7 +2508,7 @@ class max_speed result = self if inplace else self.copy() if not drop: - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): names = tuple( name if name is not None else f"level_{i}" for i, name in enumerate(self.index.names) @@ -2983,9 +2968,7 @@ def rename( "mixed type is not yet supported." 
) - if level is not None and isinstance( - self.index, cudf.core.multiindex.MultiIndex - ): + if level is not None and isinstance(self.index, MultiIndex): out_index = self.index.copy(deep=copy) out_index.get_level_values(level).to_frame().replace( to_replace=list(index.keys()), @@ -4713,7 +4696,7 @@ def to_pandas(self, nullable=False, **kwargs): if isinstance(self.columns, BaseIndex): out_columns = self.columns.to_pandas() - if isinstance(self.columns, cudf.core.multiindex.MultiIndex): + if isinstance(self.columns, MultiIndex): if self.columns.names is not None: out_columns.names = self.columns.names else: @@ -4889,7 +4872,7 @@ def to_arrow(self, preserve_index=True): "step": 1, } else: - if isinstance(self.index, cudf.MultiIndex): + if isinstance(self.index, MultiIndex): gen_names = tuple( f"level_{i}" for i, _ in enumerate(self.index._data.names) @@ -5965,11 +5948,11 @@ def stack(self, level=-1, dropna=True): repeated_index = self.index.repeat(self.shape[1]) name_index = Frame({0: self._column_names}).tile(self.shape[0]) new_index = list(repeated_index._columns) + [name_index._columns[0]] - if isinstance(self._index, cudf.MultiIndex): + if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: index_names = [None] * len(new_index) - new_index = cudf.core.multiindex.MultiIndex.from_frame( + new_index = MultiIndex.from_frame( DataFrame(dict(zip(range(0, len(new_index)), new_index))), names=index_names, ) @@ -6529,7 +6512,7 @@ def from_pandas(obj, nan_as_null=None): elif isinstance(obj, pd.Series): return Series.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) + return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.RangeIndex): return cudf.core.index.RangeIndex( start=obj.start, stop=obj.stop, step=obj.step, name=obj.name @@ -6647,7 +6630,7 @@ def extract_col(df, col): if ( col == "index" and col not in 
df.index._data - and not isinstance(df.index, cudf.MultiIndex) + and not isinstance(df.index, MultiIndex) ): return df.index._data.columns[0] return df.index._data[col] From de8272379088e7d3236e18a37a0badaec41b0a09 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 09:48:23 -0800 Subject: [PATCH 06/11] Remove _rename_columns. --- python/cudf/cudf/core/dataframe.py | 23 ----------------------- python/cudf/cudf/tests/test_dataframe.py | 9 --------- 2 files changed, 32 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4696804fe01..0f1a56752bd 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1314,15 +1314,6 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _get_numeric_data(self): - """Return a dataframe with only numeric data types""" - columns = [ - c - for c, dt in self.dtypes.items() - if dt != object and not is_categorical_dtype(dt) - ] - return self[columns] - def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. 
@@ -2091,20 +2082,6 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) - def _rename_columns(self, new_names): - old_cols = iter(self._data.names) - l_old_cols = len(self._data) - l_new_cols = len(new_names) - if l_new_cols != l_old_cols: - msg = ( - f"Length of new column names: {l_new_cols} does not " - "match length of previous column names: {l_old_cols}" - ) - raise ValueError(msg) - - mapper = dict(zip(old_cols, new_names)) - self.rename(mapper=mapper, inplace=True, axis=1) - def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ab0856fad1e..b70b1a657d1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3377,15 +3377,6 @@ def test_dataframe_describe_percentiles(): assert_eq(pdf_results, gdf_results) -def test_get_numeric_data(): - pdf = pd.DataFrame( - {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} - ) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) - - @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) @pytest.mark.parametrize("data_empty", [False, True]) From 7921067364f7860ee7dfd894c6a1d12f70d761f5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 09:56:43 -0800 Subject: [PATCH 07/11] Fix typo. 
--- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8bc757f33db..18b2a6cf91e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1630,7 +1630,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): def fill(self, fill_value, begin=0, end=-1, inplace=False): warnings.warn( - "The fill method will be removed in a future cuDF " "release.", + "The fill method will be removed in a future cuDF release.", FutureWarning, ) fill_values = [fill_value] From 794df593902179ab3fae9c50ca0227f38c4fb367 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 13:11:21 -0800 Subject: [PATCH 08/11] Standardize all DataFrame references. --- python/cudf/cudf/core/dataframe.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0f1a56752bd..c91be2a13ca 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -243,7 +243,7 @@ def _getitem_tuple_arg(self, arg): df = columns_df._apply_boolean_mask(tmp_arg[0]) else: tmp_col_name = str(uuid4()) - other_df = cudf.DataFrame( + other_df = DataFrame( {tmp_col_name: column.arange(len(tmp_arg[0]))}, index=as_index(tmp_arg[0]), ) @@ -314,7 +314,7 @@ def _setitem_tuple_arg(self, key, value): self._frame._data.insert(key[1], new_col) else: if isinstance(value, (cupy.ndarray, np.ndarray)): - value_df = cudf.DataFrame(value) + value_df = DataFrame(value) if value_df.shape[1] != columns_df.shape[1]: if value_df.shape[1] == 1: value_cols = ( @@ -345,7 +345,7 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): def _getitem_tuple_arg(self, arg): # Iloc Step 1: # Gather the columns specified by the second tuple arg - columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1])) + columns_df = 
DataFrame(self._frame._get_columns_by_index(arg[1])) columns_df._index = self._frame._index @@ -395,7 +395,7 @@ def _getitem_tuple_arg(self, arg): @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1])) + columns = DataFrame(self._frame._get_columns_by_index(key[1])) for col in columns: self._frame[col].iloc[key[0]] = value @@ -2126,11 +2126,9 @@ def _reindex( columns = ( columns if columns is not None else list(df._column_names) ) - df = cudf.DataFrame() + df = DataFrame() else: - df = cudf.DataFrame(None, index).join( - df, how="left", sort=True - ) + df = DataFrame(None, index).join(df, how="left", sort=True) # double-argsort to map back from sorted to unsorted positions df = df.take(index.argsort(ascending=True).argsort()) @@ -3222,7 +3220,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError("axis not implemented yet") if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): - result = cudf.DataFrame() + result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization for agg in aggs: @@ -3235,7 +3233,7 @@ def agg(self, aggs, axis=None): f"{aggs} is not a valid function for " f"'DataFrame' object" ) - result = cudf.DataFrame() + result = DataFrame() result[aggs] = getattr(df_normalized, aggs)() result = result.iloc[:, 0] result.name = None @@ -3270,7 +3268,7 @@ def agg(self, aggs, axis=None): raise NotImplementedError( "callable parameter is not implemented yet" ) - result = cudf.DataFrame(index=idxs, columns=cols) + result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = df_normalized[key] col_empty = column_empty( @@ -5377,7 +5375,7 @@ def _prepare_for_rowwise_op(self, method, skipna): warnings.warn(msg) if not skipna and any(col.nullable for col in filtered._columns): - mask = cudf.DataFrame( + mask = DataFrame( { name: filtered._data[name]._get_mask_as_column() if 
filtered._data[name].nullable @@ -6190,8 +6188,8 @@ def append( elif isinstance(other, list): if not other: pass - elif not isinstance(other[0], cudf.DataFrame): - other = cudf.DataFrame(other) + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) if (self.columns.get_indexer(other.columns) >= 0).all(): other = other.reindex(columns=self.columns) From 1c8e0194d0448e7e65ac624ca7d1f0f2315f59f7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 13:23:01 -0800 Subject: [PATCH 09/11] Fix fill test. --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/tests/test_fill.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c91be2a13ca..3c467ac2cd6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -91,8 +91,8 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( - self._frame.columns, cudf.MultiIndex + if isinstance(self._frame.index, MultiIndex) or isinstance( + self._frame.columns, MultiIndex ): # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index efbe2834486..224db2b39d1 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -50,7 +50,7 @@ def test_fill(data, fill_value, begin, end, inplace): begin = max(0, min(len(gs), begin)) end = max(0, min(len(gs), end)) - actual = gs._fill([fill_value], begin, end, False) + actual = gs.fill(fill_value, begin, end, False) assert actual is not gs ps[begin:end] = fill_value From c1b608bc2d0cc7ace266f7c54acf4ba7cc03dddf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Dec 2021 14:13:23 -0800 Subject: [PATCH 10/11] Revert "Inline _split." 
This reverts commit 7f734973fc4014f8dcf5542c33dbeb4e45db8db2. --- python/cudf/cudf/core/frame.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7eb7aa0c3cd..c4ee00c60b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1024,11 +1024,6 @@ def _partition(self, scatter_map, npartitions, keep_index=True): result = partitioned._split(output_offsets, keep_index=keep_index) - splits = libcudf.copying.table_split( - self, output_offsets, keep_index=keep_index - ) - result = [self.__class__._from_data(*result) for result in splits] - for frame in result: frame._copy_type_metadata(self, include_index=keep_index) @@ -3738,6 +3733,12 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + def _split(self, splits, keep_index=True): + results = libcudf.copying.table_split( + self, splits, keep_index=keep_index + ) + return [self.__class__._from_data(*result) for result in results] + def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): From 63e3896bf95c68597127d4979efe9f77449110d1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Dec 2021 13:49:46 -0800 Subject: [PATCH 11/11] Reintroduce _get_numeric_data for dask. --- python/cudf/cudf/core/dataframe.py | 10 ++++++++++ python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3c467ac2cd6..b7fc5efb412 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1314,6 +1314,16 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented + # The _get_numeric_data method is necessary for dask compatibility.
+ def _get_numeric_data(self): + """Return a dataframe with only numeric data types""" + columns = [ + c + for c, dt in self.dtypes.items() + if dt != object and not is_categorical_dtype(dt) + ] + return self[columns] + def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b70b1a657d1..ab0856fad1e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3377,6 +3377,15 @@ def test_dataframe_describe_percentiles(): assert_eq(pdf_results, gdf_results) +def test_get_numeric_data(): + pdf = pd.DataFrame( + {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} + ) + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) + + @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) @pytest.mark.parametrize("data_empty", [False, True])