diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40adeb28d47b6..7b046cb3a515b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -703,6 +703,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) +- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f8cb99e2b2e75..8d94d61a10739 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8524,7 +8524,7 @@ def idxmin(self, axis=0, skipna=True) -> Series: indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return Series(result, index=self._get_agg_axis(axis)) + return self._constructor_sliced(result, index=self._get_agg_axis(axis)) def idxmax(self, axis=0, skipna=True) -> Series: """ @@ -8591,7 +8591,7 @@ def idxmax(self, axis=0, skipna=True) -> Series: indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return Series(result, index=self._get_agg_axis(axis)) + return self._constructor_sliced(result, index=self._get_agg_axis(axis)) def _get_agg_axis(self, axis_num: int) -> Index: """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b35798079ba7f..04a3524fed480 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -330,7 +330,7 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - return DataFrame(results, columns=columns) + return self.obj._constructor_expanddim(results, columns=columns) def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, @@ -359,10 +359,12 @@ def _wrap_series_output( result: Union[Series, DataFrame] if len(output) > 1: - result = DataFrame(indexed_output, index=index) + result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns else: - result = Series(indexed_output[0], index=index, name=columns[0]) + result = self.obj._constructor( + indexed_output[0], index=index, name=columns[0] + ) return result @@ -421,7 +423,9 @@ def _wrap_transformed_output( def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self._selection_name, index=keys, dtype=np.float64) + return self.obj._constructor( + [], name=self._selection_name, index=keys, dtype=np.float64 + ) def _get_index() -> Index: if self.grouper.nkeys > 1: @@ -433,7 +437,9 @@ def _get_index() -> Index: if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result = self._reindex_output(DataFrame(values, index=index)) + result = self._reindex_output( + self.obj._constructor_expanddim(values, index=index) + ) # if self.observed is False, # keep all-NaN rows created while re-indexing result = result.stack(dropna=self.observed) @@ -447,7 +453,9 @@ def _get_index() -> Index: 
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(data=values, index=_get_index(), name=self._selection_name) + result = self.obj._constructor( + data=values, index=_get_index(), name=self._selection_name + ) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -527,7 +535,7 @@ def _transform_general( result = concat(results).sort_index() else: - result = Series(dtype=np.float64) + result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs @@ -550,7 +558,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: out = algorithms.take_1d(result._values, ids) if cast: out = maybe_cast_result(out, self.obj, how=func_nm) - return Series(out, index=self.obj.index, name=self.obj.name) + return self.obj._constructor(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -651,7 +659,7 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - result = Series(res, index=ri, name=self._selection_name) + result = self.obj._constructor(res, index=ri, name=self._selection_name) return self._reindex_output(result, fill_value=0) @doc(Series.describe) @@ -753,7 +761,7 @@ def value_counts( if is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -785,7 +793,7 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: if is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) def count(self) -> Series: """ @@ -804,7 +812,7 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - result = Series( + result = self.obj._constructor( out, index=self.grouper.result_index, name=self._selection_name, @@ -1202,11 +1210,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: if cannot_agg: result_columns = result_columns.drop(cannot_agg) - return DataFrame(result, columns=result_columns) + return self.obj._constructor(result, columns=result_columns) def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: - return DataFrame(index=keys) + return self.obj._constructor(index=keys) key_names = self.grouper.names @@ -1216,7 +1224,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if first_not_none is None: # GH9684. If all values are None, then this will throw an error. # We'd prefer it return an empty dataframe. 
- return DataFrame() + return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: @@ -1247,13 +1255,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if first_not_none is None: - return DataFrame() + return self.obj._constructor() elif isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if first_not_none._constructor is Series: + if isinstance(first_not_none, Series): backup = create_series_with_explicit_dtype( **kwargs, dtype_if_empty=object ) @@ -1320,7 +1328,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): or isinstance(key_index, MultiIndex) ): stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values, index=key_index, columns=index ) else: @@ -1337,7 +1345,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): result.columns = index elif isinstance(v, ABCSeries): stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values.T, index=v.index, columns=key_index ) else: @@ -1345,7 +1353,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # fall through to the outer else clause # TODO: sure this is right? we used to do this # after raising AttributeError above - return Series(values, index=key_index, name=self._selection_name) + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -1362,7 +1372,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return Series(values, index=key_index) + return self.obj._constructor_sliced(values, index=key_index) else: # Handle cases like BinGrouper @@ -1396,7 +1406,9 @@ def _transform_general( if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_func # Return the result as a DataFrame for concatenation later - res = DataFrame(res, index=group.index, columns=group.columns) + res = self.obj._constructor( + res, index=group.index, columns=group.columns + ) else: # Try slow path and fast path. 
try: @@ -1419,7 +1431,7 @@ def _transform_general( r.columns = group.columns r.index = group.index else: - r = DataFrame( + r = self.obj._constructor( np.concatenate([res.values] * len(group.index)).reshape( group.shape ), @@ -1495,7 +1507,9 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) - return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) + return self.obj._constructor._from_arrays( + output, columns=result.columns, index=obj.index + ) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): @@ -1557,7 +1571,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: if len(output) < len(obj.columns): columns = columns.take(inds) - return DataFrame(output, index=obj.index, columns=columns) + return self.obj._constructor(output, index=obj.index, columns=columns) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -1672,9 +1686,11 @@ def _wrap_frame_output(self, result, obj) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: - return DataFrame(result, index=obj.columns, columns=result_index).T + return self.obj._constructor( + result, index=obj.columns, columns=result_index + ).T else: - return DataFrame(result, index=obj.index, columns=result_index) + return self.obj._constructor(result, index=obj.index, columns=result_index) def _get_data_to_aggregate(self) -> BlockManager: obj = self._obj_with_exclusions @@ -1718,7 +1734,7 @@ def _wrap_aggregated_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result = DataFrame(indexed_output) + result = self.obj._constructor(indexed_output) result.columns = columns if not self.as_index: @@ -1751,7 +1767,7 @@ def _wrap_transformed_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result = DataFrame(indexed_output) + result = self.obj._constructor(indexed_output) result.columns = columns result.index = self.obj.index @@ -1761,14 +1777,14 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index mgr = BlockManager(blocks, axes=[items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 81c3fd7ad9e89..08d18397d7225 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1182,6 +1182,14 @@ class GroupBy(_GroupBy[FrameOrSeries]): more """ + @property + def _obj_1d_constructor(self) -> Type["Series"]: + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, DataFrame): + return self.obj._constructor_sliced + assert isinstance(self.obj, Series) + return self.obj._constructor + def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. 
@@ -1420,8 +1428,11 @@ def size(self): """ result = self.grouper.size() - if isinstance(self.obj, Series): - result.name = self.obj.name + # GH28330 preserve subclassed Series/DataFrames through calls + if issubclass(self.obj._constructor, Series): + result = self._obj_1d_constructor(result, name=self.obj.name) + else: + result = self._obj_1d_constructor(result) return self._reindex_output(result, fill_value=0) @classmethod @@ -2116,7 +2127,7 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - result = Series(self.grouper.group_info[0], index) + result = self._obj_1d_constructor(self.grouper.group_info[0], index) if not ascending: result = self.ngroups - 1 - result return result @@ -2178,7 +2189,7 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) - return Series(cumcounts, index) + return self._obj_1d_constructor(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 2f66cbf44788d..28dfaea8ed425 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -469,7 +469,9 @@ def get_result(self): # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) - cons = DataFrame + + # GH28330 Preserves subclassed objects through concat + cons = self.objs[0]._constructor_expanddim index, columns = self.new_axes df = cons(data, index=index) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 16bf651829a04..f4f7cc5c1a7d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -573,3 +573,17 @@ def test_subclassed_boolean_reductions(self, all_boolean_reductions): df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result = getattr(df, all_boolean_reductions)() assert isinstance(result, tm.SubclassedSeries) + + def test_idxmin_preserves_subclass(self): + # GH 28330 + + df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + result = df.idxmin() + assert isinstance(result, tm.SubclassedSeries) + + def test_idxmax_preserves_subclass(self): + # GH 28330 + + df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + result = df.idxmax() + assert isinstance(result, tm.SubclassedSeries) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py new file mode 100644 index 0000000000000..6adae19005c3a --- /dev/null +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -0,0 +1,77 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A"), + ], +) +def test_groupby_preserves_subclass(obj, groupby_func): + # GH28330 -- preserve subclass through groupby operations + + if isinstance(obj, Series) and groupby_func in {"corrwith"}: + pytest.skip("Not applicable") + + grouped = obj.groupby(np.arange(0, 10)) + + # Groups should preserve subclass type + assert isinstance(grouped.get_group(0), type(obj)) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(obj) + elif groupby_func == "tshift": + 
args.extend([0, 0]) + + result1 = getattr(grouped, groupby_func)(*args) + result2 = grouped.agg(groupby_func, *args) + + # Reduction or transformation kernels should preserve type + slices = {"ngroup", "cumcount", "size"} + if isinstance(obj, DataFrame) and groupby_func in slices: + assert isinstance(result1, obj._constructor_sliced) + else: + assert isinstance(result1, type(obj)) + + # Confirm .agg() groupby operations return same results + if isinstance(result1, DataFrame): + tm.assert_frame_equal(result1, result2) + else: + tm.assert_series_equal(result1, result2) + + +@pytest.mark.parametrize( + "obj", [DataFrame, tm.SubclassedDataFrame], +) +def test_groupby_resample_preserves_subclass(obj): + # GH28330 -- preserve subclass through groupby.resample() + + df = obj( + { + "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + df = df.set_index("Date") + + # Confirm groupby.resample() preserves dataframe type + result = df.groupby("Buyer").resample("5D").sum() + assert isinstance(result, obj) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 6625ab86cfed4..b7fd12326bcfe 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2817,3 +2817,17 @@ def test_duplicate_keys(keys): ) expected = DataFrame(expected_values, columns=expected_columns) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A"), + ], +) +def test_concat_preserves_subclass(obj): + # GH28330 -- preserve subclass + + result = concat([obj, obj]) + assert isinstance(result, type(obj))
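
A minimal end-to-end sketch of the behavior the new tests assert (assumes a pandas build with this patch applied; the grouping key and the kernels exercised below are illustrative choices, while tm.SubclassedDataFrame and tm.SubclassedSeries are the test-suite subclasses used in the tests above):

    import numpy as np
    import pandas as pd
    import pandas._testing as tm

    df = tm.SubclassedDataFrame({"A": np.arange(10)})
    grouped = df.groupby(np.arange(10) % 2)

    # Aggregations now route through self.obj._constructor, so the
    # subclass survives instead of being downcast to a plain DataFrame
    assert isinstance(grouped.sum(), tm.SubclassedDataFrame)

    # 1-D results go through the new _obj_1d_constructor property, which
    # resolves to _constructor_sliced for DataFrame inputs, so DataFrame
    # inputs yield the subclassed Series
    assert isinstance(grouped.size(), tm.SubclassedSeries)
    assert isinstance(df.idxmin(), tm.SubclassedSeries)

    # concat also preserves the subclass, per the new
    # test_concat_preserves_subclass
    s = tm.SubclassedSeries(np.arange(10), name="A")
    assert isinstance(pd.concat([s, s]), tm.SubclassedSeries)

The same mechanism is what the parametrized test_groupby_preserves_subclass checks across every groupby kernel, with ngroup, cumcount, and size expected to return the sliced (1-D) subclass for DataFrame inputs.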