rapidsai · rapids-bot · Jun 4, 2024 · May 22, 2024 · May 31, 2024 · Jun 4, 2024
@@ -55,6 +55,7 @@ ARROW_TO_PYLIBCUDF_TYPES = {
     pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS,
     pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS,
     pa.date32(): type_id.TIMESTAMP_DAYS,
+    pa.null(): type_id.EMPTY,
 }
 
 LIBCUDF_TO_ARROW_TYPES = {
@@ -245,7 +246,7 @@ def _to_arrow_datatype(cudf_object, **kwargs):
         return pa.list_(value_type)
     else:
         try:
-            return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()]
+            return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()]
         except KeyError:
             raise TypeError(
                 f"Unable to convert {cudf_object.id()} to arrow datatype"

@@ -484,32 +484,48 @@ def do_evaluate(
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.AllHorizontal:
             name = columns[0].name
@@ -717,7 +733,9 @@ def do_evaluate(
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()),
+                plc.interop.from_arrow(
+                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type()))
+                ),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -893,11 +911,13 @@ def _reduce(
         )
 
     def _count(self, column: Column) -> Column:
-        # TODO: dtype handling
         return Column(
             plc.Column.from_scalar(
                 plc.interop.from_arrow(
-                    pa.scalar(column.obj.size() - column.obj.null_count()),
+                    pa.scalar(
+                        column.obj.size() - column.obj.null_count(),
+                        type=plc.interop.to_arrow(self.dtype),
+                    ),
                 ),
                 1,
             ),
@@ -909,7 +929,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column:
             return Column(
                 plc.Column.from_scalar(
                     plc.interop.from_arrow(
-                        pa.scalar(float("nan")), data_type=self.dtype
+                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
                 ),
@@ -924,7 +944,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column:
             return Column(
                 plc.Column.from_scalar(
                     plc.interop.from_arrow(
-                        pa.scalar(float("nan")), data_type=self.dtype
+                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
                 ),

@@ -146,9 +146,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             assert_never(self.typ)
         if row_index is not None:
             name, offset = row_index
-            # TODO: dtype
-            step = plc.interop.from_arrow(pa.scalar(1))
-            init = plc.interop.from_arrow(pa.scalar(offset))
+            dtype = self.schema[name]
+            step = plc.interop.from_arrow(
+                pa.scalar(1, type=plc.interop.to_arrow(dtype))
+            )
+            init = plc.interop.from_arrow(
+                pa.scalar(offset, type=plc.interop.to_arrow(dtype))
+            )
             index = Column(
                 plc.filling.sequence(df.num_rows, init, step), name
             ).set_sorted(

@@ -9,9 +9,11 @@
 from functools import singledispatch
 from typing import Any
 
+import pyarrow as pa
+
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
-import cudf._lib.pylibcudf as plc  # noqa: TCH002, singledispatch register needs this name defined.
+import cudf._lib.pylibcudf as plc
 
 from cudf_polars.dsl import expr, ir
 from cudf_polars.utils import dtypes
@@ -295,7 +297,8 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 @_translate_expr.register
 def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr:
-    return expr.Literal(dtype, node.value)
+    value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
+    return expr.Literal(dtype, value)
 
 
 @_translate_expr.register
@@ -337,7 +340,7 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr:
     inner = translate_expr(visitor, n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
-        return expr.Literal(dtype, inner.value)
+        return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
     else:
         return expr.Cast(dtype, inner)
 

@@ -13,6 +13,8 @@
 
 import cudf._lib.pylibcudf as plc
 
+__all__ = ["from_polars"]
+
 
 @cache
 def from_polars(dtype: pl.DataType) -> plc.DataType:
@@ -84,6 +86,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
     elif isinstance(dtype, pl.List):
+        # TODO: This doesn't consider the value type.
         return plc.DataType(plc.TypeId.LIST)
     else:
         raise NotImplementedError(f"{dtype=} conversion not supported")
@@ -134,7 +134,7 @@ ignore = [
 fixable = ["ALL"]
 
 [tool.ruff.lint.per-file-ignores]
-"**/tests/**/test_*.py" = ["D", "INP"]
+"**/tests/**/*.py" = ["D"]
 
 [tool.ruff.lint.flake8-pytest-style]
 # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style

@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
@@ -56,7 +56,7 @@ def test_agg(df, agg):
     q = df.select(expr)
 
     # https://github.com/rapidsai/cudf/issues/15852
-    check_dtype = agg not in {"count", "n_unique", "median"}
+    check_dtype = agg not in {"n_unique", "median"}
     if not check_dtype and q.schema["a"] != pl.Float64:
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)

@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"]
+)
+def op(request):
+    return request.param
+
+
+@pytest.fixture
+def df(nullable):
+    values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1]
+    if nullable:
+        values[1] = None
+        values[4] = None
+    return pl.LazyFrame({"a": values})
+
+
+def test_expr_distinct(df, op):
+    expr = getattr(pl.col("a"), op)()
+    query = df.select(expr)
+    assert_gpu_result_equal(query)
@@ -10,17 +10,7 @@
 
 
 @pytest.fixture(
-    params=[
-        (None, None),
-        pytest.param(
-            ("row-index", 0),
-            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
-        ),
-        pytest.param(
-            ("index", 10),
-            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
-        ),
-    ],
+    params=[(None, None), ("row-index", 0), ("index", 10)],
     ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
 )
 def row_index(request):