From 78929a15727ea5fd9ae437b819522ebbf3a81785 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 30 Apr 2024 16:18:30 +0000 Subject: [PATCH] Build out tests --- python/cudf_polars/cudf_polars/plan.py | 7 +- .../cudf_polars/testing/__init__.py | 2 + .../cudf_polars/testing/asserts.py | 60 ++++++++++++++++ python/cudf_polars/tests/conftest.py | 12 ++++ python/cudf_polars/tests/test_basic.py | 4 +- python/cudf_polars/tests/test_extcontext.py | 18 +++++ python/cudf_polars/tests/test_filter.py | 23 +++++++ python/cudf_polars/tests/test_hconcat.py | 18 +++++ python/cudf_polars/tests/test_hstack.py | 18 +++++ python/cudf_polars/tests/test_join.py | 68 +++++++++++++++++++ python/cudf_polars/tests/test_slice.py | 32 +++++++++ python/cudf_polars/tests/test_union.py | 22 ++++++ 12 files changed, 276 insertions(+), 8 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/testing/__init__.py create mode 100644 python/cudf_polars/cudf_polars/testing/asserts.py create mode 100644 python/cudf_polars/tests/conftest.py create mode 100644 python/cudf_polars/tests/test_extcontext.py create mode 100644 python/cudf_polars/tests/test_filter.py create mode 100644 python/cudf_polars/tests/test_hconcat.py create mode 100644 python/cudf_polars/tests/test_hstack.py create mode 100644 python/cudf_polars/tests/test_join.py create mode 100644 python/cudf_polars/tests/test_slice.py create mode 100644 python/cudf_polars/tests/test_union.py diff --git a/python/cudf_polars/cudf_polars/plan.py b/python/cudf_polars/cudf_polars/plan.py index 30794c7b653..78eb48e4753 100644 --- a/python/cudf_polars/cudf_polars/plan.py +++ b/python/cudf_polars/cudf_polars/plan.py @@ -436,8 +436,6 @@ def _join(plan: nodes.Join, visitor: PlanVisitor): raise NotImplementedError("cross join not implemented") if how == "outer" and not coalesce_key_columns: raise NotImplementedError("Non-coalescing outer join") - elif how == "outer_coalesce": - how = "outer" joiner, left_policy, right_policy = { "inner": ( 
plc.join.inner_join, @@ -738,6 +736,7 @@ def _union(plan: nodes.Union, visitor: PlanVisitor): all_names = list( itertools.chain.from_iterable(t.names() for t in input_tables) ) + # TODO: use polars schema schema = reduce(operator.or_, (t.schema() for t in input_tables)) tables = [ plc.Table( @@ -777,10 +776,6 @@ def _hconcat(plan: nodes.HConcat, visitor: PlanVisitor): @_execute_plan.register def _extcontext(plan: nodes.ExtContext, visitor: PlanVisitor): result = visitor(plan.input) - # TODO: This is not right, e.g. if there is a projection that - # selects some subset of the columns. But it seems it is not - # pushed inside the ExtContext node, so we need some other way of - # handling that. return DataFrame( reduce( operator.or_, diff --git a/python/cudf_polars/cudf_polars/testing/__init__.py b/python/cudf_polars/cudf_polars/testing/__init__.py new file mode 100644 index 00000000000..bee2a596af2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py new file mode 100644 index 00000000000..61c728ec4b1 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from polars.testing.asserts import assert_frame_equal + + +def assert_gpu_result_equal( + lazydf, + *, + check_row_order: bool = True, + check_column_order: bool = True, + check_dtype: bool = True, + check_exact: bool = True, + rtol: float = 1e-05, + atol: float = 1e-08, + categorical_as_str: bool = False, +): + """ + Assert that collection of a lazyframe on GPU produces correct results. + + Parameters + ---------- + lazydf + frame to collect. 
+ check_row_order + Expect rows to be in same order + check_column_order + Expect columns to be in same order + check_dtype + Expect dtypes to match + check_exact + Require exact equality for floats; if `False`, compare using + rtol and atol. + rtol + Relative tolerance for float comparisons + atol + Absolute tolerance for float comparisons + categorical_as_str + Cast categoricals to strings before comparing + + Raises + ------ + AssertionError + If the GPU and CPU collection do not match. + NotImplementedError + If GPU collection failed in some way. + """ + expect = lazydf.collect(use_gpu=False) + got = lazydf.collect(use_gpu=True, cpu_fallback=False) + assert_frame_equal( + expect, + got, + check_row_order=check_row_order, + check_column_order=check_column_order, + check_dtype=check_dtype, + check_exact=check_exact, + rtol=rtol, + atol=atol, + categorical_as_str=categorical_as_str, + ) diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py new file mode 100644 index 00000000000..9d3ec633076 --- /dev/null +++ b/python/cudf_polars/tests/conftest.py @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + + +def pytest_sessionstart(session): + from cudf_polars.patch import _WAS_PATCHED + + if not _WAS_PATCHED: + # We could also just patch in the test, but this approach + # provides a canary for failures with patching that we might + # observe in trying this with other tests. 
+ raise RuntimeError("Patch was not applied") diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index b8468258fd3..3567ec3669a 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -178,7 +178,7 @@ def test_agg(df, agg): .lazy() ) out = getattr(ldf, agg)() - assert_gpu_result_equal(out, check_dtype=agg != "count") + assert_gpu_result_equal(out, check_dtype=agg != "count", check_exact=False) @pytest.mark.parametrize("keep", ["first", "last", "none"]) @@ -226,7 +226,7 @@ def test_concat_horizontal(ldf): def test_groupby(ldf): out = ldf.group_by("int_key1").agg(pl.col("float_val").sum()) - assert_gpu_result_equal(out, check_row_order=False) + assert_gpu_result_equal(out, check_row_order=False, check_exact=False) def test_expr_function(ldf): diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py new file mode 100644 index 00000000000..c17765066d4 --- /dev/null +++ b/python/cudf_polars/tests/test_extcontext.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_extcontext(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) + query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/test_filter.py new file mode 100644 index 00000000000..0c11655877b --- /dev/null +++ b/python/cudf_polars/tests/test_filter.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_filter(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + # group-by is just to avoid the filter being pushed into the scan. + query = ( + ldf.group_by(pl.col("a")) + .agg(pl.col("b").sum()) + .filter(pl.col("b") < 1) + ) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py new file mode 100644 index 00000000000..da79200e424 --- /dev/null +++ b/python/cudf_polars/tests/test_hconcat.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hconcat(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c")) + query = pl.concat([ldf, ldf2], how="horizontal") + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py new file mode 100644 index 00000000000..7d8e148635c --- /dev/null +++ b/python/cudf_polars/tests/test_hstack.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hstack(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.with_columns(pl.col("a") + pl.col("b")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py new file mode 100644 index 00000000000..5f9a32c588a --- /dev/null +++ b/python/cudf_polars/tests/test_join.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +import polars as pl +import pytest + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "how", + [ + "inner", + "left", + pytest.param( + "outer", + marks=pytest.mark.xfail( + reason="non-coalescing join not implemented" + ), + ), + "semi", + "anti", + pytest.param( + "cross", + marks=pytest.mark.xfail(reason="cross join not implemented"), + ), + "outer_coalesce", + ], +) +@pytest.mark.parametrize( + "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] +) +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + pytest.param( + pl.col("a") * 2, + marks=pytest.mark.xfail( + reason="Taking key columns from wrong table" + ), + ), + pytest.param( + [pl.col("a"), pl.col("a") + 1], + marks=pytest.mark.xfail( + reason="Taking key columns from wrong table" + ), + ), + ["c", "a"], + ], +) +def test_join(how, join_nulls, join_expr): + left = pl.DataFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ).lazy() + right = pl.DataFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ).lazy() + + query = left.join(right, on=join_expr, how=how, join_nulls=join_nulls) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_slice.py 
b/python/cudf_polars/tests/test_slice.py new file mode 100644 index 00000000000..7829d21bb42 --- /dev/null +++ b/python/cudf_polars/tests/test_slice.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +import polars as pl +import pytest + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "offset", + [0, 1, 2], +) +@pytest.mark.parametrize( + "len", + [0, 2, 12], +) +def test_slice(offset, len): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ( + ldf.group_by(pl.col("a")) + .agg(pl.col("b").sum()) + .sort(by=pl.col("a")) + .slice(offset, len) + ) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py new file mode 100644 index 00000000000..9c8ae0d51ab --- /dev/null +++ b/python/cudf_polars/tests/test_union.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +import polars as pl +import pytest + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.xfail(reason="Need handling of null scalars that are cast") +def test_union(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) + query = pl.concat([ldf, ldf2], how="diagonal") + # Plan for this produces a `None`.astype(Int64) which we don't + # handle correctly right now + assert_gpu_result_equal(query)