Skip to content

Commit

Permalink
Finish implementation of cudf-polars boolean function handlers (#16098)
Browse files Browse the repository at this point in the history
The missing nodes were `is_in`, `not` (both easy), `is_finite` and `is_infinite` (obtained by translating to `contains` calls).

While here, remove the implementation of `IsBetween` and just translate to an expression with binary operations. This removes the need for special-casing scalar arguments to `IsBetween` and reproducing the code for binop evaluation.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16098
  • Loading branch information
wence- authored Jun 27, 2024
1 parent 2ed69c9 commit c847b98
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 35 deletions.
67 changes: 38 additions & 29 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,12 +443,12 @@ def __init__(
):
# With ignore_nulls == False, polars uses Kleene logic
raise NotImplementedError(f"Kleene logic for {self.name}")
if self.name in (
pl_expr.BooleanFunction.IsFinite,
pl_expr.BooleanFunction.IsInfinite,
pl_expr.BooleanFunction.IsIn,
if self.name == pl_expr.BooleanFunction.IsIn and not all(
c.dtype == self.children[0].dtype for c in self.children
):
raise NotImplementedError(f"{self.name}")
# TODO: If polars IR doesn't put the casts in, we need to
# mimic the supertype promotion rules.
raise NotImplementedError("IsIn doesn't support supertype casting")

@staticmethod
def _distinct(
Expand Down Expand Up @@ -506,6 +506,33 @@ def do_evaluate(
mapping: Mapping[Expr, Column] | None = None,
) -> Column:
"""Evaluate this expression given a dataframe for context."""
if self.name in (
pl_expr.BooleanFunction.IsFinite,
pl_expr.BooleanFunction.IsInfinite,
):
# Avoid evaluating the child if the dtype tells us it's unnecessary.
(child,) = self.children
is_finite = self.name == pl_expr.BooleanFunction.IsFinite
if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
value = plc.interop.from_arrow(
pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
)
return Column(plc.Column.from_scalar(value, df.num_rows))
needles = child.evaluate(df, context=context, mapping=mapping)
to_search = [-float("inf"), float("inf")]
if is_finite:
# NaN is neither finite nor infinite
to_search.append(float("nan"))
haystack = plc.interop.from_arrow(
pa.array(
to_search,
type=plc.interop.to_arrow(needles.obj.type()),
)
)
result = plc.search.contains(haystack, needles.obj)
if is_finite:
result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
return Column(result)
columns = [
child.evaluate(df, context=context, mapping=mapping)
for child in self.children
Expand Down Expand Up @@ -612,31 +639,13 @@ def do_evaluate(
(c.obj for c in columns),
)
)
elif self.name == pl_expr.BooleanFunction.IsBetween:
column, lo, hi = columns
(closed,) = self.options
lop, rop = self._BETWEEN_OPS[closed]
lo_obj = (
lo.obj_scalar
if lo.is_scalar and lo.obj.size() != column.obj.size()
else lo.obj
)
hi_obj = (
hi.obj_scalar
if hi.is_scalar and hi.obj.size() != column.obj.size()
else hi.obj
)
elif self.name == pl_expr.BooleanFunction.IsIn:
needles, haystack = columns
return Column(plc.search.contains(haystack.obj, needles.obj))
elif self.name == pl_expr.BooleanFunction.Not:
(column,) = columns
return Column(
plc.binaryop.binary_operation(
plc.binaryop.binary_operation(
column.obj, lo_obj, lop, output_type=self.dtype
),
plc.binaryop.binary_operation(
column.obj, hi_obj, rop, output_type=self.dtype
),
plc.binaryop.BinaryOperator.LOGICAL_AND,
self.dtype,
)
plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT)
)
else:
raise NotImplementedError(
Expand Down
10 changes: 10 additions & 0 deletions python/cudf_polars/cudf_polars/dsl/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
*(translate_expr(visitor, n=n) for n in node.input),
)
elif isinstance(name, pl_expr.BooleanFunction):
if name == pl_expr.BooleanFunction.IsBetween:
column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
(closed,) = options
lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
return expr.BinOp(
dtype,
plc.binaryop.BinaryOperator.LOGICAL_AND,
expr.BinOp(dtype, lop, column, lo),
expr.BinOp(dtype, rop, column, hi),
)
return expr.BooleanFunction(
dtype,
name,
Expand Down
48 changes: 42 additions & 6 deletions python/cudf_polars/tests/expressions/test_booleanfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@

import polars as pl

from cudf_polars.testing.asserts import assert_gpu_result_equal
from cudf_polars.testing.asserts import (
assert_gpu_result_equal,
assert_ir_translation_raises,
)


@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
Expand Down Expand Up @@ -67,23 +70,26 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls):

df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())})

q = df.select(expr(pl.col("a")))
q = df.select(expr(pl.col("a")), expr(pl.col("a")).not_().alias("b"))

assert_gpu_result_equal(q)


@pytest.mark.xfail(reason="Evaluation handlers not yet implemented")
@pytest.mark.parametrize(
"expr",
[
pl.col("a").is_finite(),
pl.col("a").is_infinite(),
pl.col("a").is_in(pl.col("b")),
[pl.col("a").is_infinite(), pl.col("b").is_finite()],
],
)
def test_unsupported_boolean_function(expr):
def test_boolean_finite(expr):
df = pl.LazyFrame(
{"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]}
{
"a": pl.Series([1, float("nan"), 2, float("inf")], dtype=pl.Float64()),
"b": [1, 2, 3, 4],
"c": pl.Series([1, 2, 3, 4], dtype=pl.Float64()),
}
)

q = df.select(expr)
Expand Down Expand Up @@ -133,3 +139,33 @@ def test_boolean_horizontal(request, expr, has_nulls, wide):
q = ldf.select(expr)

assert_gpu_result_equal(q)


@pytest.mark.parametrize(
    "expr",
    [
        pl.col("a").is_in(pl.col("b")),
        pl.col("a").is_in(pl.col("c")),
        pl.col("c").is_in(pl.col("d")),
    ],
)
def test_boolean_is_in(expr):
    """Run an ``is_in`` query between Int64 columns through the GPU check.

    Columns ``a`` and ``b`` hold no nulls, while ``c`` and ``d`` each
    contain one ``None``, so the parametrized expressions cover needles
    and haystacks both with and without null entries.
    """
    columns = {
        "a": pl.Series([1, 2, 3], dtype=pl.Int64()),
        "b": pl.Series([3, 4, 2], dtype=pl.Int64()),
        "c": pl.Series([1, None, 3], dtype=pl.Int64()),
        "d": pl.Series([10, None, 11], dtype=pl.Int64()),
    }
    query = pl.LazyFrame(columns).select(expr)
    assert_gpu_result_equal(query)


def test_boolean_is_in_raises_unsupported():
    """Expect ``NotImplementedError`` for ``is_in`` with mismatched dtypes.

    The needle column is Int64 while the haystack literal is Int32; the
    query is handed to ``assert_ir_translation_raises``.
    """
    frame = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)})
    haystack = pl.lit(1, dtype=pl.Int32())
    query = frame.select(pl.col("a").is_in(haystack))
    assert_ir_translation_raises(query, NotImplementedError)

0 comments on commit c847b98

Please sign in to comment.