From f881c6c52cb5c7ee96520548a2423da06ffe2d25 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 5 May 2023 13:11:00 -0400 Subject: [PATCH] Change default value of the `observed=` argument in groupby to `True` to reflect the actual behaviour (#13296) When grouping by a categorical, cuDF always behaves as if `observed=True` had been passed (only categories that actually occur in the data appear in the result), even though the keyword's declared default was `False`. This changes the default value of the kwarg to `True` to reflect that. In the session below, inputs 43-46 show pandas for comparison and inputs 47-49 show cuDF: ```python In [43]: df = pd.DataFrame({'a': [1, 2, 3, 4]}) In [44]: idx = pd.CategoricalIndex([1, 1, 2, 3], categories=[1, 2, 3, 4, 5]) In [45]: df.groupby(idx).sum() Out[45]: a 1 3 2 3 3 4 4 0 5 0 In [46]: df.groupby(idx, observed=True).sum() Out[46]: a 1 3 2 3 3 4 In [47]: df = cudf.DataFrame({'a': [1, 2, 3, 4]}) In [48]: idx = cudf.CategoricalIndex([1, 1, 2, 3], categories=[1, 2, 3, 4, 5]) In [49]: df.groupby(idx).sum() Out[49]: a 1 3 3 4 2 3 ``` Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13296 --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 4 ++-- python/cudf/cudf/core/series.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index af3ba801a82..de324515729 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4072,7 +4072,7 @@ def groupby( sort=False, group_keys=False, squeeze=False, - observed=False, + observed=True, dropna=True, ): return super().groupby( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 97dfb16bd32..95931af038c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3918,7 +3918,7 @@ def groupby( sort=False, group_keys=False, squeeze=False, - observed=False, + observed=True, dropna=True, ): if axis not in (0, "index"): @@ -3929,7 +3929,7 @@ def groupby( 
"squeeze parameter is not yet implemented" ) - if observed is not False: + if not observed: raise NotImplementedError( "observed parameter is not yet implemented" ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 57a3653edf1..6d4caebb8ad 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3297,7 +3297,7 @@ def groupby( sort=False, group_keys=False, squeeze=False, - observed=False, + observed=True, dropna=True, ): return super().groupby(