Handle nans in groupby-aggregations in polars executor (#16233)

Polars `min` and `max` by default ignore nans (treating them as nulls), to mimic this behaviour we must mask out nans before performing a min/max aggregation. Do this by exposing `nans_to_nulls` in pylibcudf and implementing a `with_mask` method on pylibcudf Columns. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: #16233
rapidsai · Jul 12, 2024 · 4fc8e79 · 4fc8e79
1 parent 1ff7461
commit 4fc8e79
Show file tree

Hide file tree

Showing 18 changed files with 230 additions and 44 deletions.
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -34,6 +34,7 @@ This page provides API documentation for pylibcudf.
     stream_compaction
     table
     traits
+    transform
     types
     unary
 

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
@@ -0,0 +1,6 @@
+=========
+transform
+=========
+
+.. automodule:: cudf._lib.pylibcudf.transform
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -39,6 +39,7 @@ set(cython_sources
     sorting.pyx
     table.pyx
     traits.pyx
+    transform.pyx
     types.pyx
     unary.pyx
     utils.pyx

diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -24,6 +24,7 @@ from . cimport (
     stream_compaction,
     strings,
     traits,
+    transform,
     types,
     unary,
 )
@@ -63,6 +64,7 @@ __all__ = [
     "strings",
     "sorting",
     "traits",
+    "transform",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -24,6 +24,7 @@
     stream_compaction,
     strings,
     traits,
+    transform,
     types,
     unary,
 )
@@ -64,6 +65,7 @@
     "strings",
     "sorting",
     "traits",
+    "transform",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -50,6 +50,7 @@ cdef class Column:
     cpdef gpumemoryview null_mask(self)
     cpdef list children(self)
     cpdef Column copy(self)
+    cpdef Column with_mask(self, gpumemoryview, size_type)
 
     cpdef ListColumnView list_view(self)
 

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -175,6 +175,32 @@ cdef class Column:
             children,
         )
 
+    cpdef Column with_mask(self, gpumemoryview mask, size_type null_count):
+        """Augment this column with a new null mask.
+
+        Parameters
+        ----------
+        mask : gpumemoryview
+            New mask (or None to unset the mask)
+        null_count : int
+            New null count. If this is incorrect, bad things happen.
+
+        Returns
+        -------
+        New Column object sharing data with self (except for the mask which is new).
+        """
+        if mask is None and null_count > 0:
+            raise ValueError("Empty mask must have null count of zero")
+        return Column(
+            self._data_type,
+            self._size,
+            self._data,
+            mask,
+            null_count,
+            self._offset,
+            self._children,
+        )
+
     @staticmethod
     cdef Column from_column_view(const column_view& cv, Column owner):
         """Create a Column from a libcudf column_view.
@@ -250,7 +276,7 @@ cdef class Column:
         column is in use.
         """
         data = gpumemoryview(obj)
-        iface = data.__cuda_array_interface__()
+        iface = data.__cuda_array_interface__
         if iface.get('mask') is not None:
             raise ValueError("mask not yet supported.")
 
@@ -400,8 +426,8 @@ def is_c_contiguous(
     itemsize : int
         Size of an element in bytes.
 
-    Return
-    ------
+    Returns
+    -------
     bool
         The boolean answer.
     """

diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx
@@ -22,5 +22,6 @@ cdef class gpumemoryview:
         # TODO: Need to respect readonly
         self.ptr = cai["data"][0]
 
+    @property
     def __cuda_array_interface__(self):
         return self.obj.__cuda_array_interface__
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+from .gpumemoryview cimport gpumemoryview
+
+
+cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input)
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move, pair
+
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+
+from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+from .column cimport Column
+from .gpumemoryview cimport gpumemoryview
+
+
+cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
+    """Create a null mask preserving existing nulls and converting nans to null.
+
+    Parameters
+    ----------
+    input : Column
+        Column to produce new mask from.
+
+    Returns
+    -------
+    Two-tuple of a gpumemoryview wrapping the null mask and the new null count.
+    """
+    cdef pair[unique_ptr[device_buffer], size_type] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.nans_to_nulls(input.view()))
+
+    return (
+        gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))),
+        c_result.second
+    )
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
@@ -20,6 +20,7 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform
 from cudf._lib.column cimport Column
 from cudf._lib.expressions cimport Expression
+from cudf._lib.pylibcudf cimport transform as plc_transform
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
@@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
 
 @acquire_spill_lock()
 def nans_to_nulls(Column input):
-    cdef column_view c_input = input.view()
-    cdef pair[unique_ptr[device_buffer], size_type] c_output
-    cdef unique_ptr[device_buffer] c_buffer
-
-    with nogil:
-        c_output = move(libcudf_transform.nans_to_nulls(c_input))
-        c_buffer = move(c_output.first)
-
-    if c_output.second == 0:
-        return None
-
-    return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer)))
+    (mask, _) = plc_transform.nans_to_nulls(
+        input.to_pylibcudf(mode="read")
+    )
+    return as_buffer(mask)
 
 
 @acquire_spill_lock()

diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -203,6 +203,15 @@ def sorted_opt(request):
     return request.param
 
 
-@pytest.fixture(scope="session", params=[False, True])
+@pytest.fixture(
+    scope="session", params=[False, True], ids=["without_nulls", "with_nulls"]
+)
 def has_nulls(request):
     return request.param
+
+
+@pytest.fixture(
+    scope="session", params=[False, True], ids=["without_nans", "with_nans"]
+)
+def has_nans(request):
+    return request.param
diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import math
+
+import pyarrow as pa
+from utils import assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_nans_to_nulls(has_nans):
+    if has_nans:
+        values = [1, float("nan"), float("nan"), None, 3, None]
+    else:
+        values = [1, 4, 5, None, 3, None]
+
+    replaced = [
+        None if (v is None or (v is not None and math.isnan(v))) else v
+        for v in values
+    ]
+
+    h_input = pa.array(values, type=pa.float32())
+    input = plc.interop.from_arrow(h_input)
+    assert input.null_count() == h_input.null_count
+    expect = pa.array(replaced, type=pa.float32())
+
+    mask, null_count = plc.transform.nans_to_nulls(input)
+
+    assert null_count == expect.null_count
+    got = input.with_mask(mask, null_count)
+
+    assert_column_eq(expect, got)
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
@@ -128,24 +128,29 @@ def copy(self) -> Self:
         )
 
     def mask_nans(self) -> Self:
-        """Return a copy of self with nans masked out."""
-        if self.nan_count > 0:
-            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
+        """Return a shallow copy of self with nans masked out."""
+        if plc.traits.is_floating_point(self.obj.type()):
+            old_count = self.obj.null_count()
+            mask, new_count = plc.transform.nans_to_nulls(self.obj)
+            result = type(self)(self.obj.with_mask(mask, new_count))
+            if old_count == new_count:
+                return result.sorted_like(self)
+            return result
         return self.copy()
 
     @functools.cached_property
     def nan_count(self) -> int:
         """Return the number of NaN values in the column."""
-        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-            return 0
-        return plc.interop.to_arrow(
-            plc.reduce.reduce(
-                plc.unary.is_nan(self.obj),
-                plc.aggregation.sum(),
-                # TODO: pylibcudf needs to have a SizeType DataType singleton
-                plc.DataType(plc.TypeId.INT32),
-            )
-        ).as_py()
+        if plc.traits.is_floating_point(self.obj.type()):
+            return plc.interop.to_arrow(
+                plc.reduce.reduce(
+                    plc.unary.is_nan(self.obj),
+                    plc.aggregation.sum(),
+                    # TODO: pylibcudf needs to have a SizeType DataType singleton
+                    plc.DataType(plc.TypeId.INT32),
+                )
+            ).as_py()
+        return 0
 
 
 class NamedColumn(Column):
@@ -187,3 +192,17 @@ def copy(self, *, new_name: str | None = None) -> Self:
             order=self.order,
             null_order=self.null_order,
         )
+
+    def mask_nans(self) -> Self:
+        """Return a shallow copy of self with nans masked out."""
+        # Annoying, the inheritance is not right (can't call the
+        # super-type mask_nans), but will sort that by refactoring
+        # later.
+        if plc.traits.is_floating_point(self.obj.type()):
+            old_count = self.obj.null_count()
+            mask, new_count = plc.transform.nans_to_nulls(self.obj)
+            result = type(self)(self.obj.with_mask(mask, new_count), self.name)
+            if old_count == new_count:
+                return result.sorted_like(self)
+            return result
+        return self.copy()
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -867,7 +867,7 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
-        if self.name not in ("round", "unique"):
+        if self.name not in ("round", "unique", "mask_nans"):
             raise NotImplementedError(f"Unary function {name=}")
 
     def do_evaluate(
@@ -878,6 +878,9 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
+        if self.name == "mask_nans":
+            (child,) = self.children
+            return child.evaluate(df, context=context, mapping=mapping).mask_nans()
         if self.name == "round":
             (decimal_places,) = self.options
             (values,) = (
@@ -1215,12 +1218,19 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             raise NotImplementedError(
                 "Nested aggregations in groupby"
             )  # pragma: no cover; check_agg trips first
+        if (isminmax := self.name in {"min", "max"}) and self.options:
+            raise NotImplementedError("Nan propagation in groupby for min/max")
         (child,) = self.children
         ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
         if self.request is None:
             raise NotImplementedError(
                 f"Aggregation {self.name} in groupby"
             )  # pragma: no cover; __init__ trips first
+        if isminmax and plc.traits.is_floating_point(self.dtype):
+            assert expr is not None
+            # Ignore nans in these groupby aggs, do this by masking
+            # nans in the input
+            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
         return AggInfo([(expr, self.request, self)])
 
     def _reduce(

diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
@@ -3,12 +3,14 @@
 
 from __future__ import annotations
 
+from functools import partial
+
 import pyarrow
 import pytest
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers import Column
+from cudf_polars.containers import Column, NamedColumn
 
 
 def test_non_scalar_access_raises():
@@ -54,17 +56,21 @@ def test_shallow_copy():
 
 
 @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32])
-def test_mask_nans(typeid):
+@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")])
+def test_mask_nans(typeid, constructor):
     dtype = plc.DataType(typeid)
     values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype))
-    column = Column(plc.interop.from_arrow(values))
+    column = constructor(plc.interop.from_arrow(values))
     masked = column.mask_nans()
-    assert column.obj is masked.obj
+    assert column.obj.null_count() == masked.obj.null_count()
 
 
-def test_mask_nans_float_with_nan_notimplemented():
+def test_mask_nans_float():
     dtype = plc.DataType(plc.TypeId.FLOAT32)
     values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype))
     column = Column(plc.interop.from_arrow(values))
-    with pytest.raises(NotImplementedError):
-        _ = column.mask_nans()
+    masked = column.mask_nans()
+    expect = pyarrow.array([0, 0, None], type=plc.interop.to_arrow(dtype))
+    got = pyarrow.array(plc.interop.to_arrow(masked.obj))
+
+    assert expect == got
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,6 +34,7 @@ This page provides API documentation for pylibcudf. @@
         stream_compaction
         table
         traits
+        transform
         types
         unary
@@ Expand Down @@