Define Column.nan_as_null to return self (#15923)

While trying to clean all the `fillna` logic, I needed to have a `Column.nan_as_null` defined to make the `fillna` logic more re-useable. This allows other `nan_as_null` usages in cudf to avoiding checking whether it's defined on the column or not. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #15923
rapidsai · Jun 7, 2024 · d83d086 · d83d086
1 parent 9bd16bb
commit d83d086
Show file tree

Hide file tree

Showing 9 changed files with 42 additions and 43 deletions.
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -2072,12 +2072,7 @@ def dropna(self, how="any"):
             pass
         # This is to be consistent with IndexedFrame.dropna to handle nans
         # as nulls by default
-        data_columns = [
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col
-            for col in self._columns
-        ]
+        data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
             drop_nulls(

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -816,10 +816,8 @@ def to_pandas(
             .values_host
         )
 
-        cats = col.categories
-        if cats.dtype.kind in "biuf":
-            cats = cats.nans_to_nulls().dropna()  # type: ignore[attr-defined]
-        elif not isinstance(cats.dtype, IntervalDtype):
+        cats = col.categories.nans_to_nulls()
+        if not isinstance(cats.dtype, IntervalDtype):
             # leaving out dropna because it temporarily changes an interval
             # index into a struct and throws off results.
             # TODO: work on interval index dropna

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool:
 
         return libcudf.reduce.reduce("any", self, dtype=np.bool_)
 
-    def dropna(self) -> ColumnBase:
+    def dropna(self) -> Self:
         return drop_nulls([self])[0]._with_type_metadata(self.dtype)
 
     def to_arrow(self) -> pa.Array:
@@ -695,7 +695,9 @@ def fillna(
         Returns a copy with null filled.
         """
         return libcudf.replace.replace_nulls(
-            input_col=self, replacement=fill_value, method=method
+            input_col=self.nans_to_nulls(),
+            replacement=fill_value,
+            method=method,
         )._with_type_metadata(self.dtype)
 
     def isnull(self) -> ColumnBase:
@@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str):
             f"Operation {unaryop} not supported for dtype {self.dtype}."
         )
 
+    def nans_to_nulls(self: Self) -> Self:
+        """Convert NaN to NA."""
+        return self
+
     def normalize_binop_value(
         self, other: ScalarLike
     ) -> Union[ColumnBase, ScalarLike]:
@@ -1802,9 +1808,7 @@ def as_column(
 
         data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
         col = build_column(data, dtype=arbitrary.dtype, mask=mask)
-        if (
-            nan_as_null or (mask is None and nan_as_null is None)
-        ) and col.dtype.kind == "f":
+        if nan_as_null or (mask is None and nan_as_null is None):
             col = col.nans_to_nulls()
         if dtype is not None:
             col = col.astype(dtype)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -536,7 +536,7 @@ def fillna(
             return col
 
         if method is not None:
-            return super(NumericalColumn, col).fillna(fill_value, method)
+            return super().fillna(fill_value, method)
 
         if fill_value is None:
             raise ValueError("Must specify either 'fill_value' or 'method'")
@@ -545,7 +545,7 @@ def fillna(
             isinstance(fill_value, cudf.Scalar)
             and fill_value.dtype == col.dtype
         ):
-            return super(NumericalColumn, col).fillna(fill_value, method)
+            return super().fillna(fill_value, method)
 
         if np.isscalar(fill_value):
             # cast safely to the same dtype as self
@@ -572,7 +572,7 @@ def fillna(
             else:
                 fill_value = fill_value.astype(col.dtype)
 
-        return super(NumericalColumn, col).fillna(fill_value, method)
+        return super().fillna(fill_value, method)
 
     def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
         """

diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
@@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float:
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
 
-        self = self.nans_to_nulls().dropna()  # type: ignore
+        self = self.nans_to_nulls().dropna()
 
         if len(self) < 4:
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
@@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike:
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
 
-        self = self.nans_to_nulls().dropna()  # type: ignore
+        self = self.nans_to_nulls().dropna()
 
         if len(self) < 3:
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True):
         results = {}
         for name, col in self._data.items():
             if skipna:
-                try:
-                    result_col = col.nans_to_nulls()
-                except AttributeError:
-                    result_col = col
+                result_col = col.nans_to_nulls()
             else:
                 if col.has_nulls(include_nan=True):
                     first_index = col.isnull().find_first_value(True)
@@ -1915,12 +1912,12 @@ def nans_to_nulls(self):
         1  <NA>  3.14
         2  <NA>  <NA>
         """
-        result = (
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col.copy()
-            for col in self._data.columns
-        )
+        result = []
+        for col in self._data.columns:
+            converted = col.nans_to_nulls()
+            if converted is col:
+                converted = converted.copy()
+            result.append(converted)
         return self._from_data_like_self(
             self._data._from_columns_like_self(result)
         )
@@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
                 thresh = len(df)
 
         for name, col in df._data.items():
-            try:
-                check_col = col.nans_to_nulls()
-            except AttributeError:
-                check_col = col
+            check_col = col.nans_to_nulls()
             no_threshold_valid_count = (
                 len(col) - check_col.null_count
             ) < thresh
@@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
         if len(subset) == 0:
             return self.copy(deep=True)
 
-        data_columns = [
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col
-            for col in self._columns
-        ]
+        data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
             libcudf.stream_compaction.drop_nulls(

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
@@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na):
     else:
         unique = column.unique().sort_values()
     if not dummy_na:
-        if np.issubdtype(unique.dtype, np.floating):
-            unique = unique.nans_to_nulls()
-        unique = unique.dropna()
+        unique = unique.nans_to_nulls().dropna()
     return unique
 
 

diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import cudf
@@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex():
     actual = gdf.fillna(10)
 
     assert_eq(expected, actual)
+
+
+def test_fillna_nan_and_null():
+    ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False)
+    result = ser.fillna(2.2)
+    expected = cudf.Series([2.2, 2.2, 1.1])
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy():
     ser1 = cudf.Series(range(3), index=list("abc"))
     ser2 = cudf.Series(ser1)
     assert ser1.index is ser2.index
+
+
+@pytest.mark.parametrize("value", [1, 1.1])
+def test_nans_to_nulls_noop_copies_column(value):
+    ser1 = cudf.Series([value])
+    ser2 = ser1.nans_to_nulls()
+    assert ser1._column is not ser2._column