diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9307267b227..e1b6cc45dd3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
         self._data = ColumnAccessor(
             data=dict(zip(other.names, self._data.columns)),
             multiindex=other.multiindex,
+            rangeindex=other.rangeindex,
             level_names=other.level_names,
             label_dtype=other.label_dtype,
             verify=False,
@@ -7534,7 +7535,7 @@ def _sample_axis_1(
     def _from_columns_like_self(
         self,
         columns: List[ColumnBase],
-        column_names: abc.Iterable[str],
+        column_names: Optional[abc.Iterable[str]] = None,
         index_names: Optional[List[str]] = None,
         *,
         override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index d898eb4b9c3..fdc78005996 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -40,8 +40,6 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
-    is_bool_dtype,
-    is_decimal_dtype,
     is_dict_like,
     is_list_like,
     is_scalar,
@@ -372,7 +370,6 @@ def _mimic_inplace(
             self._index = result.index
         return super()._mimic_inplace(result, inplace)
 
-    # Scans
     @_cudf_nvtx_annotate
     def _scan(self, op, axis=None, skipna=True):
         """
@@ -417,8 +414,8 @@ def _scan(self, op, axis=None, skipna=True):
         cast_to_int = op in ("cumsum", "cumprod")
         skipna = True if skipna is None else skipna
 
-        results = {}
-        for name, col in self._data.items():
+        results = []
+        for col in self._columns:
             if skipna:
                 result_col = col.nans_to_nulls()
             else:
@@ -429,19 +426,14 @@ def _scan(self, op, axis=None, skipna=True):
             else:
                 result_col = col
 
-            if (
-                cast_to_int
-                and not is_decimal_dtype(result_col.dtype)
-                and (
-                    np.issubdtype(result_col.dtype, np.integer)
-                    or np.issubdtype(result_col.dtype, np.bool_)
-                )
-            ):
+            if cast_to_int and result_col.dtype.kind in "uib":
                 # For reductions that accumulate a value (e.g. sum, not max)
                 # pandas returns an int64 dtype for all int or bool dtypes.
                 result_col = result_col.astype(np.int64)
-            results[name] = getattr(result_col, op)()
-        return self._from_data(results, self.index)
+            results.append(getattr(result_col, op)())
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(results)
+        )
 
     def _check_data_index_length_match(self) -> None:
         # Validate that the number of rows in the data matches the index if the
@@ -880,7 +872,6 @@ def replace(
                 FutureWarning,
             )
         if not (to_replace is None and value is no_default):
-            copy_data = {}
             (
                 all_na_per_column,
                 to_replace_per_column,
@@ -890,10 +881,10 @@ def replace(
                 value=value,
                 columns_dtype_map=dict(self._dtypes),
             )
-
+            copy_data = []
             for name, col in self._data.items():
                 try:
-                    copy_data[name] = col.find_and_replace(
+                    replaced = col.find_and_replace(
                         to_replace_per_column[name],
                         replacements_per_column[name],
                         all_na_per_column[name],
@@ -906,11 +897,13 @@ def replace(
                     #    that exists in `copy_data`.
                     # ii. There is an OverflowError while trying to cast
                     #    `to_replace_per_column` to `replacements_per_column`.
-                    copy_data[name] = col.copy(deep=True)
+                    replaced = col.copy(deep=True)
+                copy_data.append(replaced)
+            result = self._from_data_like_self(
+                self._data._from_columns_like_self(copy_data)
+            )
         else:
-            copy_data = self._data.copy(deep=True)
-
-        result = self._from_data(copy_data, self.index)
+            result = self.copy()
 
         return self._mimic_inplace(result, inplace=inplace)
 
@@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         ):
             lower[0], upper[0] = upper[0], lower[0]
-        data = {
-            name: col.clip(lower[i], upper[i])
-            for i, (name, col) in enumerate(self._data.items())
-        }
-        output = self._from_data(data, self.index)
-        output._copy_type_metadata(self, include_index=False)
+        data = (
+            col.clip(low, high)
+            for col, low, high in zip(self._columns, lower, upper)
+        )
+        output = self._from_data_like_self(
+            self._data._from_columns_like_self(data)
+        )
 
         return self._mimic_inplace(output, inplace=inplace)
 
     @_cudf_nvtx_annotate
@@ -1913,7 +1907,7 @@ def nans_to_nulls(self):
         2
         """
         result = []
-        for col in self._data.columns:
+        for col in self._columns:
             converted = col.nans_to_nulls()
             if converted is col:
                 converted = converted.copy()
@@ -2028,8 +2022,8 @@ def interpolate(
             )
 
         interpolator = cudf.core.algorithms.get_column_interpolator(method)
-        columns = {}
-        for colname, col in data._data.items():
+        columns = []
+        for col in data._columns:
             if isinstance(col, cudf.core.column.StringColumn):
                 warnings.warn(
                     f"{type(self).__name__}.interpolate with object dtype is "
@@ -2040,9 +2034,12 @@ def interpolate(
                 col = col.astype("float64").fillna(np.nan)
 
             # Interpolation methods may or may not need the index
-            columns[colname] = interpolator(col, index=data.index)
+            columns.append(interpolator(col, index=data.index))
 
-        result = self._from_data(columns, index=data.index)
+        result = self._from_data_like_self(
+            self._data._from_columns_like_self(columns)
+        )
+        result.index = data.index
 
         return (
             result
@@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         data_columns = (
             col.shift(periods, fill_value) for col in self._columns
         )
-        return self.__class__._from_data(
-            zip(self._column_names, data_columns), self.index
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
         )
 
     @_cudf_nvtx_annotate
@@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
             self._column_names,
             None if has_range_index or not keep_index else self.index.names,
         )
-        result._data.label_dtype = self._data.label_dtype
-        result._data.rangeindex = self._data.rangeindex
 
         if keep_index and has_range_index:
             result.index = self.index[start:stop]
@@ -3561,11 +3556,6 @@ def sort_values(
             ),
             keep_index=not ignore_index,
         )
-        if (
-            isinstance(self, cudf.core.dataframe.DataFrame)
-            and self._data.multiindex
-        ):
-            out.columns = self._data.to_pandas_index()
         return out
 
     def _n_largest_or_smallest(
@@ -3659,14 +3649,12 @@ def _align_to_index(
             result = result.sort_values(sort_col_id)
             del result[sort_col_id]
 
-        result = self.__class__._from_data(
-            data=result._data, index=result.index
+        out = self._from_data(
+            self._data._from_columns_like_self(result._columns)
         )
-        result._data.multiindex = self._data.multiindex
-        result._data._level_names = self._data._level_names
-        result.index.names = self.index.names
-
-        return result
+        out.index = result.index
+        out.index.names = self.index.names
+        return out
 
     @_cudf_nvtx_annotate
     def _reindex(
@@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"):
                 "decimals must be an integer, a dict-like or a Series"
            )
 
-        cols = {
-            name: col.round(decimals[name], how=how)
-            if (
-                name in decimals
-                and _is_non_decimal_numeric_dtype(col.dtype)
-                and not is_bool_dtype(col.dtype)
-            )
+        cols = (
+            col.round(decimals[name], how=how)
+            if name in decimals and col.dtype.kind in "fiu"
             else col.copy(deep=True)
             for name, col in self._data.items()
-        }
-
-        return self.__class__._from_data(
-            data=cudf.core.column_accessor.ColumnAccessor(
-                cols,
-                multiindex=self._data.multiindex,
-                level_names=self._data.level_names,
-            ),
-            index=self.index,
+        )
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(cols)
        )
 
     def resample(
@@ -6238,6 +6216,8 @@ def rank(
                 f"axis={axis} is not yet supported in rank"
             )
 
+        num_cols = self._num_columns
+        dropped_cols = False
         source = self
         if numeric_only:
             if isinstance(
@@ -6255,15 +6235,28 @@ def rank(
                 source = self._get_columns_by_label(numeric_cols)
                 if source.empty:
                     return source.astype("float64")
+            elif source._num_columns != num_cols:
+                dropped_cols = True
 
         result_columns = libcudf.sort.rank_columns(
             [*source._columns], method_enum, na_option, ascending, pct
         )
 
-        return self.__class__._from_data(
-            dict(zip(source._column_names, result_columns)),
-            index=source.index,
-        ).astype(np.float64)
+        if dropped_cols:
+            result = type(source)._from_data(
+                ColumnAccessor(
+                    dict(zip(source._column_names, result_columns)),
+                    multiindex=self._data.multiindex,
+                    level_names=self._data.level_names,
+                    label_dtype=self._data.label_dtype,
+                ),
+            )
+        else:
+            result = source._from_data_like_self(
+                self._data._from_columns_like_self(result_columns)
+            )
+        result.index = source.index
+        return result.astype(np.float64)
 
     def convert_dtypes(
         self,
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 2037b1682db..7d140a1ffa5 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION
 
-import itertools
-
 import numba
 import pandas as pd
 from pandas.api.indexers import BaseIndexer
@@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name):
             agg_params=self.agg_params,
         )
 
-    def _apply_agg_dataframe(self, df, agg_name):
-        return cudf.DataFrame._from_data(
-            {
-                col_name: self._apply_agg_column(col, agg_name)
-                for col_name, col in df._data.items()
-            },
-            index=df.index,
-        )
-
     def _apply_agg(self, agg_name):
-        if isinstance(self.obj, cudf.Series):
-            return cudf.Series._from_data(
-                {
-                    self.obj.name: self._apply_agg_column(
-                        self.obj._column, agg_name
-                    )
-                },
-                index=self.obj.index,
-            )
-        else:
-            return self._apply_agg_dataframe(self.obj, agg_name)
+        applied = (
+            self._apply_agg_column(col, agg_name) for col in self.obj._columns
+        )
+        return self.obj._from_data_like_self(
+            self.obj._data._from_columns_like_self(applied)
+        )
 
     def _reduce(
         self,
@@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window):
             )
 
     def _apply_agg(self, agg_name):
-        index = cudf.MultiIndex.from_frame(
-            cudf.DataFrame(
-                {
-                    key: value
-                    for key, value in itertools.chain(
-                        self._group_keys._data.items(),
-                        self.obj.index._data.items(),
-                    )
-                }
-            )
+        index = cudf.MultiIndex._from_data(
+            {**self._group_keys._data, **self.obj.index._data}
         )
-
         result = super()._apply_agg(agg_name)
         result.index = index
         return result
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d76d5eb8065..98e9f9881c7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10980,7 +10980,7 @@ def test_squeeze(axis, data):
     assert_eq(result, expected)
 
 
-@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
+@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)])
 @pytest.mark.parametrize(
     "operation",
     [
@@ -10991,6 +10991,16 @@ def test_squeeze(axis, data):
         lambda df: abs(df),
         lambda df: -df,
         lambda df: ~df,
+        lambda df: df.cumsum(),
+        lambda df: df.replace(1, 2),
+        lambda df: df.replace(10, 20),
+        lambda df: df.clip(0, 10),
+        lambda df: df.rolling(1).mean(),
+        lambda df: df.interpolate(),
+        lambda df: df.shift(),
+        lambda df: df.sort_values(1),
+        lambda df: df.round(),
+        lambda df: df.rank(),
     ],
 )
 def test_op_preserves_column_metadata(column, operation):
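For reference, a minimal sketch (not part of the diff) of the behavior the new test_op_preserves_column_metadata parametrizations exercise: after a column-wise op, the column labels of a cudf DataFrame should round-trip exactly like pandas, including a RangeIndex built from range(1, 2). The DataFrame contents and the assertion helper used here are assumptions for illustration only.

# Illustrative sketch only, assuming cudf and pandas are installed;
# it mirrors the intent of the new test, not its exact code.
import pandas as pd

import cudf

pdf = pd.DataFrame([[1.0]], columns=range(1, 2))  # columns -> RangeIndex(start=1, stop=2)
gdf = cudf.from_pandas(pdf)

for op in (lambda df: df.cumsum(), lambda df: df.round(), lambda df: df.shift()):
    # .columns on a cudf DataFrame is a pandas Index, so it can be compared
    # directly; exact=True also checks that the RangeIndex type is preserved.
    pd.testing.assert_index_equal(op(gdf).columns, op(pdf).columns, exact=True)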