Skip to content

Commit

Permalink
Add test coverage for slicing with "out of bounds" negative indices (#…
Browse files Browse the repository at this point in the history
…15990)

Polars wraps negative starts and then clamps both the resulting start
and length to [0, num_rows), so we should do that.

Add tests of this behaviour as well.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #15990
  • Loading branch information
wence- authored Jun 12, 2024
1 parent 2b10299 commit 9fae8ab
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 14 deletions.
13 changes: 8 additions & 5 deletions python/cudf_polars/cudf_polars/containers/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
Returns
-------
New dataframe sharing data with the input table.
New dataframe sharing data with the input table.
Raises
------
Expand Down Expand Up @@ -205,15 +205,18 @@ def slice(self, zlice: tuple[int, int] | None) -> Self:
Returns
-------
New dataframe (if zlice is not None) other self (if it is)
New dataframe (if zlice is not None) otherwise self (if it is)
"""
if zlice is None:
return self
start, length = zlice
if start < 0:
start += self.num_rows
# Polars slice takes an arbitrary positive integer and slice
# to the end of the frame if it is larger.
end = min(start + length, self.num_rows)
# Polars implementation wraps negative start by num_rows, then
# adds length to start to get the end, then clamps both to
# [0, num_rows)
end = start + length
start = max(min(start, self.num_rows), 0)
end = max(min(end, self.num_rows), 0)
(table,) = plc.copying.slice(self.table, [start, end])
return type(self).from_table(table, self.column_names).sorted_like(self)
14 changes: 12 additions & 2 deletions python/cudf_polars/cudf_polars/testing/asserts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,19 @@
from cudf_polars.callback import execute_with_cudf

if TYPE_CHECKING:
from collections.abc import Mapping

import polars as pl

from cudf_polars.typing import OptimizationArgs

__all__: list[str] = ["assert_gpu_result_equal"]


def assert_gpu_result_equal(
lazydf: pl.LazyFrame,
*,
collect_kwargs: Mapping[OptimizationArgs, bool] | None = None,
check_row_order: bool = True,
check_column_order: bool = True,
check_dtypes: bool = True,
Expand All @@ -36,6 +41,9 @@ def assert_gpu_result_equal(
----------
lazydf
frame to collect.
collect_kwargs
Keyword arguments to pass to collect. Useful for controlling
optimization settings.
check_row_order
Expect rows to be in same order
check_column_order
Expand All @@ -59,9 +67,11 @@ def assert_gpu_result_equal(
NotImplementedError
If GPU collection failed in some way.
"""
expect = lazydf.collect()
collect_kwargs = {} if collect_kwargs is None else collect_kwargs
expect = lazydf.collect(**collect_kwargs)
got = lazydf.collect(
post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
**collect_kwargs,
post_opt_callback=partial(execute_with_cudf, raise_on_fail=True),
)
assert_frame_equal(
expect,
Expand Down
15 changes: 14 additions & 1 deletion python/cudf_polars/cudf_polars/typing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import TYPE_CHECKING, Protocol, TypeAlias
from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias

from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir

Expand Down Expand Up @@ -89,3 +89,16 @@ def set_udf(
) -> None:
"""Set the callback replacing the current node in the plan."""
...


OptimizationArgs: TypeAlias = Literal[
"type_coercion",
"predicate_pushdown",
"projection_pushdown",
"simplify_expression",
"slice_pushdown",
"comm_subplan_elim",
"comm_subexpr_elim",
"cluster_with_columns",
"no_optimization",
]
13 changes: 7 additions & 6 deletions python/cudf_polars/tests/test_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@

@pytest.mark.parametrize(
"offset",
[0, 1, 2],
[0, 1, 2, -10, -20, -1, -2, 20],
)
@pytest.mark.parametrize(
"len",
[0, 2, 12],
"length",
[0, 2, 12, 11],
)
def test_slice(offset, len):
@pytest.mark.parametrize("slice_pushdown", [False, True])
def test_slice(offset, length, slice_pushdown):
ldf = pl.DataFrame(
{
"a": [1, 2, 3, 4, 5, 6, 7],
Expand All @@ -29,6 +30,6 @@ def test_slice(offset, len):
ldf.group_by(pl.col("a"))
.agg(pl.col("b").sum())
.sort(by=pl.col("a"))
.slice(offset, len)
.slice(offset, length)
)
assert_gpu_result_equal(query)
assert_gpu_result_equal(query, collect_kwargs={"slice_pushdown": slice_pushdown})

0 comments on commit 9fae8ab

Please sign in to comment.