From 89f4a99c9884199415e82949b3e594049aa9cf71 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 10:44:17 +0000 Subject: [PATCH 1/3] Support passing optimization options into equality testing Sometimes we might want to avoid running certain optimization passes, so enable that in the testing assertion function. --- python/cudf_polars/cudf_polars/testing/asserts.py | 14 ++++++++++++-- python/cudf_polars/cudf_polars/typing/__init__.py | 15 ++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 2f19b41cc3a..3edaa427432 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -13,14 +13,19 @@ from cudf_polars.callback import execute_with_cudf if TYPE_CHECKING: + from collections.abc import Mapping + import polars as pl + from cudf_polars.typing import OptimizationArgs + __all__: list[str] = ["assert_gpu_result_equal"] def assert_gpu_result_equal( lazydf: pl.LazyFrame, *, + collect_kwargs: Mapping[OptimizationArgs, bool] | None = None, check_row_order: bool = True, check_column_order: bool = True, check_dtypes: bool = True, @@ -36,6 +41,9 @@ def assert_gpu_result_equal( ---------- lazydf frame to collect. + collect_kwargs + Keyword arguments to pass to collect. Useful for controlling + optimization settings. check_row_order Expect rows to be in same order check_column_order @@ -59,9 +67,11 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - expect = lazydf.collect() + collect_kwargs = {} if collect_kwargs is None else collect_kwargs + expect = lazydf.collect(**collect_kwargs) got = lazydf.collect( - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) + **collect_kwargs, + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), ) assert_frame_equal( expect, diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 287c977f4eb..6d597a91724 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections.abc import Mapping -from typing import TYPE_CHECKING, Protocol, TypeAlias +from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir @@ -89,3 +89,16 @@ def set_udf( ) -> None: """Set the callback replacing the current node in the plan.""" ... + + +OptimizationArgs: TypeAlias = Literal[ + "type_coercion", + "predicate_pushdown", + "projection_pushdown", + "simplify_expression", + "slice_pushdown", + "comm_subplan_elim", + "comm_subexpr_elim", + "cluster_with_columns", + "no_optimization", +] From 04a2eb9677e38b5ace91814264e3ad6c3a010b12 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 10:45:43 +0000 Subject: [PATCH 2/3] Correctly implement slicing for all values Polars wraps negative starts and then clamps both the resulting start and length to [0, num_rows), so we should do that. Add tests of this behaviour as well. 
--- .../cudf_polars/cudf_polars/containers/dataframe.py | 13 ++++++++----- python/cudf_polars/tests/test_slice.py | 9 +++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 7039fcaf077..d1f7a9ed2cf 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -96,7 +96,7 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: Returns ------- - New dataframe sharing data with the input table. + New dataframe sharing data with the input table. Raises ------ @@ -205,15 +205,18 @@ def slice(self, zlice: tuple[int, int] | None) -> Self: Returns ------- - New dataframe (if zlice is not None) other self (if it is) + New dataframe (if zlice is not None) otherwise self (if it is) """ if zlice is None: return self start, length = zlice if start < 0: start += self.num_rows - # Polars slice takes an arbitrary positive integer and slice - # to the end of the frame if it is larger. 
- end = min(start + length, self.num_rows) + # Polars implementation wraps negative start by num_rows, then + # adds length to start to get the end, then clamps both to + # [0, num_rows] + end = start + length + start = max(min(start, self.num_rows), 0) + end = max(min(end, self.num_rows), 0) (table,) = plc.copying.slice(self.table, [start, end]) return type(self).from_table(table, self.column_names).sorted_like(self) diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py index d27e91302ba..1191d0b3a16 100644 --- a/python/cudf_polars/tests/test_slice.py +++ b/python/cudf_polars/tests/test_slice.py @@ -11,13 +11,14 @@ @pytest.mark.parametrize( "offset", - [0, 1, 2], + [0, 1, 2, -10, -20, -1, -2, 20], ) @pytest.mark.parametrize( "len", - [0, 2, 12], + [0, 2, 12, 11], ) -def test_slice(offset, len): +@pytest.mark.parametrize("slice_pushdown", [False, True]) +def test_slice(offset, len, slice_pushdown): ldf = pl.DataFrame( { "a": [1, 2, 3, 4, 5, 6, 7], @@ -31,4 +32,4 @@ def test_slice(offset, len): .sort(by=pl.col("a")) .slice(offset, len) ) - assert_gpu_result_equal(query) + assert_gpu_result_equal(query, collect_kwargs={"slice_pushdown": slice_pushdown}) From 5033b41b84bebc5b0225e3a2ca66ff1610b11e95 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 11:55:01 +0000 Subject: [PATCH 3/3] Rename function argument to avoid shadowing len --- python/cudf_polars/tests/test_slice.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py index 1191d0b3a16..8ea5c623ae7 100644 --- a/python/cudf_polars/tests/test_slice.py +++ b/python/cudf_polars/tests/test_slice.py @@ -14,11 +14,11 @@ [0, 1, 2, -10, -20, -1, -2, 20], ) @pytest.mark.parametrize( - "len", + "length", [0, 2, 12, 11], ) @pytest.mark.parametrize("slice_pushdown", [False, True]) -def test_slice(offset, len, slice_pushdown): +def test_slice(offset, length, 
slice_pushdown): ldf = pl.DataFrame( { "a": [1, 2, 3, 4, 5, 6, 7], @@ -30,6 +30,6 @@ def test_slice(offset, len, slice_pushdown): ldf.group_by(pl.col("a")) .agg(pl.col("b").sum()) .sort(by=pl.col("a")) - .slice(offset, len) + .slice(offset, length) ) assert_gpu_result_equal(query, collect_kwargs={"slice_pushdown": slice_pushdown})