Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement handlers for first/last in groupby #16688

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ def __init__(
self.name = name
self.children = children
if self.name != pl_expr.TemporalFunction.Year:
raise NotImplementedError(f"String function {self.name}")
raise NotImplementedError(f"Temporal function {self.name}")

def do_evaluate(
self,
Expand Down Expand Up @@ -1540,7 +1540,17 @@ def collect_agg(self, *, depth: int) -> AggInfo:
raise NotImplementedError("Nan propagation in groupby for min/max")
(child,) = self.children
((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
if self.request is None:
if self.name == "first":
request = plc.aggregation.nth_element(
0, null_handling=plc.types.NullPolicy.INCLUDE
)
elif self.name == "last":
request = plc.aggregation.nth_element(
-1, null_handling=plc.types.NullPolicy.INCLUDE
)
else:
request = self.request
if request is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
else:
request = self.request
if request is None:
elif (request := self.request) is None:

Optional: Should avoid checking an always false condition for the first two if/elif above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, cute, but I don't like the lack of else fallthrough branch from a style point of view (this would be easier to handle if pylibcudf had typing).

I reordered so the assignment to request is unconditional and then overridden in the two exceptional cases instead.

raise NotImplementedError(
f"Aggregation {self.name} in groupby"
) # pragma: no cover; __init__ trips first
Expand All @@ -1549,7 +1559,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
# Ignore nans in these groupby aggs, do this by masking
# nans in the input
expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
return AggInfo([(expr, self.request, self)])
return AggInfo([(expr, request, self)])

def _reduce(
self, column: Column, *, request: plc.aggregation.Aggregation
Expand Down
1 change: 1 addition & 0 deletions python/cudf_polars/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def keys(request):
[(pl.col("float") - pl.lit(2)).max()],
[pl.col("float").sum().round(decimals=1)],
[pl.col("float").round(decimals=1).sum()],
[pl.col("int").first(), pl.col("float").last()],
],
ids=lambda aggs: "-".join(map(str, aggs)),
)
Expand Down
Loading