diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 1e4102e4b64..07e9d1ead11 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -55,6 +55,7 @@ ARROW_TO_PYLIBCUDF_TYPES = { pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, pa.date32(): type_id.TIMESTAMP_DAYS, + pa.null(): type_id.EMPTY, } LIBCUDF_TO_ARROW_TYPES = { @@ -245,7 +246,7 @@ def _to_arrow_datatype(cudf_object, **kwargs): return pa.list_(value_type) else: try: - return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()] except KeyError: raise TypeError( f"Unable to convert {cudf_object.id()} to arrow datatype" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 249cc3775f7..7187a36f21c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -484,32 +484,48 @@ def do_evaluate( return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsLastDistinct: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsUnique: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsDuplicated: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: name = columns[0].name @@ -717,7 +733,9 @@ def do_evaluate( bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY obj = plc.replace.replace_nulls( indices.obj, - plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type())) + ), ) else: bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK @@ -893,11 +911,13 @@ def _reduce( ) def _count(self, column: Column) -> Column: - # TODO: dtype handling return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(column.obj.size() - column.obj.null_count()), + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), ), 1, ), @@ -909,7 +929,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), @@ -924,7 +944,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d630b40f600..f8441b793b5 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -146,9 +146,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - # TODO: dtype - step = plc.interop.from_arrow(pa.scalar(1)) - init = plc.interop.from_arrow(pa.scalar(offset)) + dtype = self.schema[name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) index = Column( plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b3d0edf183f..9a301164beb 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,9 +9,11 @@ from functools import singledispatch from typing import Any +import pyarrow as pa + from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir -import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. +import cudf._lib.pylibcudf as plc from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes @@ -295,7 +297,8 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: - return expr.Literal(dtype, node.value) + value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) + return expr.Literal(dtype, value) @_translate_expr.register @@ -337,7 +340,7 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: inner = translate_expr(visitor, n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): - return expr.Literal(dtype, inner.value) + return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) else: return expr.Cast(dtype, inner) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 51379433c03..bede0de3c9f 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,6 +13,8 @@ import cudf._lib.pylibcudf as plc +__all__ = ["from_polars"] + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: @@ -84,6 +86,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): + # TODO: This doesn't consider the value type. return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 49ecd7080b9..e50ee76a9b9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -134,7 +134,7 @@ ignore = [ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] -"**/tests/**/test_*.py" = ["D", "INP"] +"**/tests/**/*.py" = ["D"] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/cudf_polars/tests/__init__.py b/python/cudf_polars/tests/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/__init__.py b/python/cudf_polars/tests/expressions/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/expressions/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index c792ae64f74..645dbd26140 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,7 +56,7 @@ def test_agg(df, agg): q = df.select(expr) # https://github.com/rapidsai/cudf/issues/15852 - check_dtype = agg not in {"count", "n_unique", "median"} + check_dtype = agg not in {"n_unique", "median"} if not check_dtype and q.schema["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py new file mode 100644 index 00000000000..22865a7ce22 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) +def nullable(request): + return request.param + + +@pytest.fixture( + params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] +) +def op(request): + return request.param + + +@pytest.fixture +def df(nullable): + values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] + if nullable: + values[1] = None + values[4] = None + return pl.LazyFrame({"a": values}) + + +def test_expr_distinct(df, op): + expr = getattr(pl.col("a"), op)() + query = df.select(expr) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b75e1bdef10..b2443e357e2 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,17 +10,7 @@ @pytest.fixture( - params=[ - (None, None), - pytest.param( - ("row-index", 0), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - pytest.param( - ("index", 10), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - ], + params=[(None, None), ("row-index", 0), ("index", 10)], ids=["no-row-index", "zero-offset-row-index", "offset-row-index"], ) def row_index(request):