From 2773b3fc19b3b092bd8131dcaad20a8ce28b238a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 16:43:12 +0000 Subject: [PATCH 1/4] Translate temporal function and implement handler for year extraction --- python/cudf_polars/cudf_polars/dsl/expr.py | 40 +++++++++++++++++++ .../cudf_polars/cudf_polars/dsl/translate.py | 7 ++++ .../tests/expressions/test_datetime_basic.py | 26 ++++++++++++ 3 files changed, 73 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..d3fcf7a7840 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -44,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -779,6 +780,45 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"Temporal function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..f75cb1b1f6f 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -361,6 +361,13 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) else: raise NotImplementedError(f"No handler for Expr function node with {name=}") diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..e36c1349133 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 
@@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import methodcaller + import pytest import polars as pl @@ -32,3 +35,26 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) From 2a40b2e982ea2f60710ac2a069985a4022e00f5f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 16:56:28 +0000 Subject: [PATCH 2/4] Translate round and unique unary operations And add evaluation handlers. 
- Closes #16169 --- python/cudf_polars/cudf_polars/dsl/expr.py | 84 +++++++++++++++++++ python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- .../cudf_polars/cudf_polars/dsl/translate.py | 12 ++- .../tests/expressions/test_round.py | 37 ++++++++ .../tests/expressions/test_unique.py | 24 ++++++ python/cudf_polars/tests/test_groupby.py | 2 + 6 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_round.py create mode 100644 python/cudf_polars/tests/expressions/test_unique.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index d3fcf7a7840..4c19905c35d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -819,6 +819,90 @@ def do_evaluate( ) # pragma: no cover; init trips first +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("round", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only 
one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..44fa088e41c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -431,7 +431,7 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index f75cb1b1f6f..51a8d910185 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -368,8 +368,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..9d20fca6b23 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index aefad59eb91..cc5970acb35 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -47,6 +47,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) From db90430fb33a3a20809bef2d0ceee668fdd51ba4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 11:18:57 +0000 Subject: [PATCH 3/4] Link to cudf issue --- python/cudf_polars/tests/expressions/test_datetime_basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index e36c1349133..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -55,6 +55,8 @@ def test_datetime_extract(field): with pytest.raises(AssertionError): # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. 
+ # https://github.com/rapidsai/cudf/issues/16196 assert_gpu_result_equal(q) assert_gpu_result_equal(q, check_dtypes=False) From 69ff4dfaa0bae21330f576d63b14439226c0bdf3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 11:19:18 +0000 Subject: [PATCH 4/4] Use common with_nulls fixture --- python/cudf_polars/tests/expressions/test_round.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py index 9d20fca6b23..3af3a0ce6d1 100644 --- a/python/cudf_polars/tests/expressions/test_round.py +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -16,11 +16,6 @@ def dtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture def df(dtype, with_nulls): a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8]