diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 54f8958c9eb..a56f70e7ae2 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -685,8 +685,13 @@ def groupby_agg( "with `sort=False`, or set `shuffle=True`." ) + # Determine required columns to enable column projection + required_columns = list( + set(gb_cols).union(aggs.keys()).intersection(ddf.columns) + ) + return aca( - [ddf], + [ddf[required_columns]], chunk=chunk, chunk_kwargs=chunk_kwargs, combine=combine, diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index e43fead0b63..1f018e79ff7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -130,6 +130,10 @@ def test_groupby_agg(func, aggregation, pdf): assert_cudf_groupby_layers(actual) + # groupby.agg should add an explicit getitem layer + # to improve/enable column projection + assert hlg_layer(actual.dask, "getitem") + dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype)