Enable automatic column projection in groupby().agg (#12124)

This PR corresponds to the Dask-cudf version of dask/dask#9442, which was found to improve the performance of many groupby-based workflows.

After this PR:

```python
import dask_cudf

path = "/criteo-dataset/day_0.parquet"
ddf = dask_cudf.read_parquet(path, split_row_groups=10)

# The following takes <2s with this PR, and fails with
# an OOM error on main (using a 32GB GPU):
ddf.groupby("C1").agg({"C2": "mean"}).compute()
```

Authors:
- Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #12124
rapidsai · Nov 14, 2022 · 5081fb1 · 5081fb1
1 parent 825f049
commit 5081fb1
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
@@ -685,8 +685,13 @@ def groupby_agg(
             "with `sort=False`, or set `shuffle=True`."
         )
 
+    # Determine required columns to enable column projection
+    required_columns = list(
+        set(gb_cols).union(aggs.keys()).intersection(ddf.columns)
+    )
+
     return aca(
-        [ddf],
+        [ddf[required_columns]],
         chunk=chunk,
         chunk_kwargs=chunk_kwargs,
         combine=combine,

diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -130,6 +130,10 @@ def test_groupby_agg(func, aggregation, pdf):
 
     assert_cudf_groupby_layers(actual)
 
+    # groupby.agg should add an explicit getitem layer
+    # to improve/enable column projection
+    assert hlg_layer(actual.dask, "getitem")
+
     dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype)