From fbc68afbf12f4b471c36bf6c33c03f09a3a315cb Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 14:51:37 +0000 Subject: [PATCH 1/9] cuda array interface is a property --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e0cf8b7ee32..cec9b11bb7e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -250,7 +250,7 @@ cdef class Column: column is in use. """ data = gpumemoryview(obj) - iface = data.__cuda_array_interface__() + iface = data.__cuda_array_interface__ if iface.get('mask') is not None: raise ValueError("mask not yet supported.") diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index a2f5b2ac387..0904022a944 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -22,5 +22,6 @@ cdef class gpumemoryview: # TODO: Need to respect readonly self.ptr = cai["data"][0] + @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ From 2937dbc54453c7cd4867d1e546ac30c13dea59da Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 14:51:57 +0000 Subject: [PATCH 2/9] Allow updating a Column with a new null mask --- python/cudf/cudf/_lib/pylibcudf/column.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 26 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index d13791d95cf..13ee0a70681 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -50,6 +50,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) + cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index cec9b11bb7e..4ad3637cc19 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -175,6 +175,32 @@ cdef class Column: children, ) + cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): + """Augment this column with a new null mask. + + Parameters + ---------- + mask + New mask (or None to unset the mask) + null_count + New null count. If this is incorrect, bad things happen. + + Returns + ------- + New Column object sharing data with self (except for the mask which is new). + """ + if mask is None and null_count > 0: + raise ValueError("Empty mask must have null count of zero") + return Column( + self._data_type, + self._size, + self._data, + mask, + null_count, + self._offset, + self._children, + ) + @staticmethod cdef Column from_column_view(const column_view& cv, Column owner): """Create a Column from a libcudf column_view. From 1f7735df79d1799a42b0ae2c07aaa8653f709fbf Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 14:53:18 +0000 Subject: [PATCH 3/9] Start porting transform.hpp to pylibcudf For now just nans_to_nulls. --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/transform.rst | 6 ++++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 ++ python/cudf/cudf/_lib/pylibcudf/transform.pxd | 7 ++++ python/cudf/cudf/_lib/pylibcudf/transform.pyx | 35 +++++++++++++++++++ 7 files changed, 54 insertions(+) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index bd6f0f77357..5899d272160 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -34,6 +34,7 @@ This page provides API documentation for pylibcudf. stream_compaction table traits + transform types unary diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst new file mode 100644 index 00000000000..ef04bbad7e6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst @@ -0,0 +1,6 @@ +========= +transform +========= + +.. automodule:: cudf._lib.pylibcudf.transform + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index d22096081af..a2d11bbea6e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources sorting.pyx table.pyx traits.pyx + transform.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index d4d615cde34..da2b7806203 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -24,6 +24,7 @@ from . cimport ( stream_compaction, strings, traits, + transform, types, unary, ) @@ -63,6 +64,7 @@ __all__ = [ "strings", "sorting", "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 91f8acaf682..acbc84d7177 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -24,6 +24,7 @@ stream_compaction, strings, traits, + transform, types, unary, ) @@ -64,6 +65,7 @@ "strings", "sorting", "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd new file mode 100644 index 00000000000..4b21feffe25 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx new file mode 100644 index 00000000000..bf18c6a91a5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move, pair + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): + """Create a null mask preserving existing nulls and converting nans to null. + + Parameters + ---------- + input + Column to produce new mask from. + + Returns + ------- + Two-tuple of a gpumemoryview wrapping the null mask and the new null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.nans_to_nulls(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) From c0de6a1901773388a2dc182e6f60e442fba2d030 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 15:00:22 +0000 Subject: [PATCH 4/9] Use newly wrapped nans_to_nulls --- python/cudf/cudf/_lib/transform.pyx | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index b325173f20d..86a4a60eef1 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -20,6 +20,7 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column from cudf._lib.expressions cimport Expression +from cudf._lib.pylibcudf cimport transform as plc_transform from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression @@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): @acquire_spill_lock() def nans_to_nulls(Column input): - cdef column_view c_input = input.view() - cdef pair[unique_ptr[device_buffer], size_type] c_output - cdef unique_ptr[device_buffer] c_buffer - - with nogil: - c_output = move(libcudf_transform.nans_to_nulls(c_input)) - c_buffer = move(c_output.first) - - if c_output.second == 0: - return None - - return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer))) + (mask, _) = plc_transform.nans_to_nulls( + input.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @acquire_spill_lock() From f0a254eff2180935983a6c4fb2d96b8d70cecd49 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 17:22:29 +0000 Subject: [PATCH 5/9] Annoying --- .../cudf_polars/containers/column.py | 45 +++++++++++++------ .../tests/containers/test_column.py | 9 ++-- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 28685f0c4ed..af67059844e 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -128,24 +128,29 @@ def copy(self) -> Self: ) def mask_nans(self) -> Self: - """Return a copy of self with nans masked out.""" - if self.nan_count > 0: - raise NotImplementedError("Need to port transform.hpp to pylibcudf") + """Return a shallow copy of self with nans masked out.""" + if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count)) + if old_count == new_count: + return result.sorted_like(self) + return result return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" - if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - return 0 - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + if plc.traits.is_floating_point(self.obj.type()): + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() + return 0 class NamedColumn(Column): @@ -187,3 +192,17 @@ def copy(self, *, new_name: str | None = None) -> Self: order=self.order, null_order=self.null_order, ) + + def mask_nans(self) -> Self: + """Return a shallow copy of self with nans masked out.""" + # Annoying, the inheritance is not right (can't call the + # super-type mask_nans), but will sort that by refactoring + # later. + if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count), self.name) + if old_count == new_count: + return result.sorted_like(self) + return result + return self.copy() diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 3291d8db161..3f05674dc5a 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,12 +3,14 @@ from __future__ import annotations +from functools import partial + import pyarrow import pytest import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column +from cudf_polars.containers import Column, NamedColumn def test_non_scalar_access_raises(): @@ -54,10 +56,11 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -def test_mask_nans(typeid): +@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) +def test_mask_nans(typeid, constructor): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = Column(plc.interop.from_arrow(values)) + column = constructor(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj is masked.obj From cff4fdc12b25ffb5441705a090dde91daf2e3fda Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Jul 2024 17:23:20 +0000 Subject: [PATCH 6/9] Handle nans for nan-ignoring aggs in groupby-agg --- python/cudf_polars/cudf_polars/dsl/expr.py | 12 ++++++++- .../tests/containers/test_column.py | 11 +++++--- .../cudf_polars/tests/expressions/test_agg.py | 25 +++++++++++++------ python/cudf_polars/tests/test_groupby.py | 24 ++++++++++++++++++ 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f83d9e82d30..adf266bab81 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -867,7 +867,7 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("round", "unique"): + if self.name not in ("round", "unique", "mask_nans"): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -878,6 +878,9 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() if self.name == "round": (decimal_places,) = self.options (values,) = ( @@ -1215,12 +1218,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError( "Nested aggregations in groupby" ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests if self.request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) return AggInfo([(expr, self.request, self)]) def _reduce( diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 3f05674dc5a..4f3c0de5975 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -62,12 +62,15 @@ def test_mask_nans(typeid, constructor): values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) column = constructor(plc.interop.from_arrow(values)) masked = column.mask_nans() - assert column.obj is masked.obj + assert column.obj.null_count() == masked.obj.null_count() -def test_mask_nans_float_with_nan_notimplemented(): +def test_mask_nans_float(): dtype = plc.DataType(plc.TypeId.FLOAT32) values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype)) column = Column(plc.interop.from_arrow(values)) - with pytest.raises(NotImplementedError): - _ = column.mask_nans() + masked = column.mask_nans() + expect = pyarrow.array([0, 0, None], type=plc.interop.to_arrow(dtype)) + got = pyarrow.array(plc.interop.to_arrow(masked.obj)) + + assert expect == got diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 267d0a99692..e53fd7f8615 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -59,14 +59,25 @@ def test_agg(df, agg): @pytest.mark.parametrize( - "propagate_nans", - [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True], - ids=["mask_nans", "propagate_nans"], + "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) -@pytest.mark.parametrize("op", ["min", "max"]) -def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) - op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) +def test_agg_float_with_nans(op): + df = pl.LazyFrame( + { + "a": pl.Series([1, 2, float("nan")], dtype=pl.Float64()), + "b": pl.Series([1, 2, None], dtype=pl.Int8()), + } + ) + q = df.select(op(pl.col("a")), op(pl.col("b"))) + + assert_gpu_result_equal(q) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +@pytest.mark.parametrize("op", [pl.Expr.max, pl.Expr.min]) +def test_agg_singleton(op): + df = pl.LazyFrame({"a": pl.Series([float("nan")])}) + q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b84e2c16b43..81306397b9f 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -99,3 +99,27 @@ def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +def test_groupby_minmax_with_nan(): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg( + pl.col("value").max().alias("max"), pl.col("value").min().alias("min") + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("op", [pl.Expr.nan_max, pl.Expr.nan_min]) +def test_groupby_nan_minmax_raises(op): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg(op(pl.col("value"))) + + assert_ir_translation_raises(q, NotImplementedError) From ceff6af561cc3d5969306515c585c1259471b143 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Jul 2024 08:26:21 +0000 Subject: [PATCH 7/9] Fix docstring section --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 4ad3637cc19..c7454560c01 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -426,8 +426,8 @@ def is_c_contiguous( itemsize : int Size of an element in bytes. - Return - ------ + Returns + ------- bool The boolean answer. """ From 15802e671e38e2dab49bd930ac59ee20e1a85178 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jul 2024 14:12:24 +0000 Subject: [PATCH 8/9] Docstring type annotations --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 4 ++-- python/cudf/cudf/_lib/pylibcudf/transform.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index c7454560c01..cb96c1d9fce 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -180,9 +180,9 @@ cdef class Column: Parameters ---------- - mask + mask : gpumemoryview New mask (or None to unset the mask) - null_count + null_count : int New null count. If this is incorrect, bad things happen. Returns diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx index bf18c6a91a5..a734e71b820 100644 --- a/python/cudf/cudf/_lib/pylibcudf/transform.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx @@ -17,7 +17,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): Parameters ---------- - input + input : Column Column to produce new mask from. Returns From cb2a5a4c54bd33a142832b3f4641e49cea57be68 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jul 2024 14:24:57 +0000 Subject: [PATCH 9/9] Test nans_to_nulls in pylibcudf --- python/cudf/cudf/pylibcudf_tests/conftest.py | 11 ++++++- .../cudf/pylibcudf_tests/test_transform.py | 32 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_transform.py diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 39832eb4bba..b5bbf470eaf 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -141,6 +141,15 @@ def sorted_opt(request): return request.param -@pytest.fixture(scope="session", params=[False, True]) +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nulls", "with_nulls"] +) def has_nulls(request): return request.param + + +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nans", "with_nans"] +) +def has_nans(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py new file mode 100644 index 00000000000..312939888dd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_transform.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import math + +import pyarrow as pa +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_nans_to_nulls(has_nans): + if has_nans: + values = [1, float("nan"), float("nan"), None, 3, None] + else: + values = [1, 4, 5, None, 3, None] + + replaced = [ + None if (v is None or (v is not None and math.isnan(v))) else v + for v in values + ] + + h_input = pa.array(values, type=pa.float32()) + input = plc.interop.from_arrow(h_input) + assert input.null_count() == h_input.null_count + expect = pa.array(replaced, type=pa.float32()) + + mask, null_count = plc.transform.nans_to_nulls(input) + + assert null_count == expect.null_count + got = input.with_mask(mask, null_count) + + assert_column_eq(expect, got)