From cb0582102db4af849aacdde1c4e16565a08c684b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:15:51 +0000 Subject: [PATCH 1/3] Handle case of broadcasting empty list of columns --- python/cudf_polars/cudf_polars/dsl/ir.py | 2 ++ python/cudf_polars/tests/test_union.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..31a0be004ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index b021d832910..865b95a7d91 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -46,3 +46,12 @@ def test_concat_vertical(): q = pl.concat([ldf, ldf2], how="vertical") assert_gpu_result_equal(q) + + +def test_concat_diagonal_empty(): + df1 = pl.LazyFrame() + df2 = pl.LazyFrame({"a": [1, 2]}) + + q = pl.concat([df1, df2], how="diagonal_relaxed") + + assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True}) From c4544d59e50825289429ab140ba6c0c6af7bf030 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:19:09 +0000 Subject: [PATCH 2/3] Allow specifying exceptions to catch in execute_with_cudf Default to Exception so we can catch errors from arrow and other third-party libraries. --- python/cudf_polars/cudf_polars/callback.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise From f594e97615f0d4ae39e93b68574bc21f6ca4dea1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:20:41 +0000 Subject: [PATCH 3/3] Fix bug in documented behaviour of with_columns We were not previously discarding overlapping column names. --- python/cudf_polars/cudf_polars/containers/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..d86656578d7 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,6 +5,7 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast @@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name."""