From 2ce8c87b712695b68b767c5fb51d61f4ed7908f3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 Nov 2023 16:13:11 +0000 Subject: [PATCH 1/2] Add test of #14200 --- python/cudf/cudf/tests/groupby/test_agg.py | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 python/cudf/cudf/tests/groupby/test_agg.py diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py new file mode 100644 index 00000000000..692c6dc75be --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +import numpy as np +import pytest + +import cudf + + +@pytest.mark.parametrize( + "empty", + [ + pytest.param( + True, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/14200" + ), + ), + False, + ], + ids=["empty", "nonempty"], +) +def test_agg_count_dtype(empty): + df = cudf.DataFrame({"a": [1, 2, 1], "c": ["a", "b", "c"]}) + if empty: + df = df.iloc[:0] + result = df.groupby("a").agg({"c": "count"}) + assert result["c"].dtype == np.dtype("int64") From fb3a0ba98f7bb9ec774c85928a13b678515f5314 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 Nov 2023 16:15:26 +0000 Subject: [PATCH 2/2] Correct dtype of count aggregations on empty dataframes A count aggregation should always return an int64 column, even if the grouped dataframe is empty. Previously we did not do this because the short-circuiting for empty inputs was hit before handling the count case. Fix this by reordering the conditions. - Closes #14200 --- python/cudf/cudf/core/groupby/groupby.py | 6 +++--- python/cudf/cudf/tests/groupby/test_agg.py | 10 +--------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index c48e5109ff2..73e6774f5ce 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -592,7 +592,9 @@ def agg(self, func): # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) - if ( + if agg_kind in {"COUNT", "SIZE"}: + data[key] = col.astype("int64") + elif ( self.obj.empty and ( isinstance(agg_name, str) @@ -609,8 +611,6 @@ def agg(self, func): ) ): data[key] = col.astype(orig_dtype) - elif agg_kind in {"COUNT", "SIZE"}: - data[key] = col.astype("int64") else: data[key] = col data = ColumnAccessor(data, multiindex=multilevel) diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 692c6dc75be..7919ee4a9f1 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -7,15 +7,7 @@ @pytest.mark.parametrize( "empty", - [ - pytest.param( - True, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/14200" - ), - ), - False, - ], + [True, False], ids=["empty", "nonempty"], ) def test_agg_count_dtype(empty):