From d83d086afda1d25f5711a0aecf4ecfe6c05f7b9d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Jun 2024 07:30:32 -1000 Subject: [PATCH] Define Column.nan_as_null to return self (#15923) While trying to clean all the `fillna` logic, I needed to have a `Column.nan_as_null` defined to make the `fillna` logic more re-useable. This allows other `nan_as_null` usages in cudf to avoiding checking whether it's defined on the column or not. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15923 --- python/cudf/cudf/core/_base_index.py | 7 +---- python/cudf/cudf/core/column/categorical.py | 6 ++-- python/cudf/cudf/core/column/column.py | 14 +++++---- python/cudf/cudf/core/column/numerical.py | 6 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 29 ++++++------------- python/cudf/cudf/core/reshape.py | 4 +-- python/cudf/cudf/tests/test_replace.py | 8 +++++ python/cudf/cudf/tests/test_series.py | 7 +++++ 9 files changed, 42 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index baca7b19e58..5d0f7c4ede4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2072,12 +2072,7 @@ def dropna(self, how="any"): pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( drop_nulls( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1828c5ce97b..de20b2ace1d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -816,10 +816,8 @@ def to_pandas( .values_host ) - cats = col.categories - if cats.dtype.kind in "biuf": - cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] - elif not isinstance(cats.dtype, IntervalDtype): + cats = col.categories.nans_to_nulls() + if not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68079371b85..475d52d0fbb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self) -> ColumnBase: + def dropna(self) -> Self: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -695,7 +695,9 @@ def fillna( Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=fill_value, method=method + input_col=self.nans_to_nulls(), + replacement=fill_value, + method=method, )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str): f"Operation {unaryop} not supported for dtype {self.dtype}." ) + def nans_to_nulls(self: Self) -> Self: + """Convert NaN to NA.""" + return self + def normalize_binop_value( self, other: ScalarLike ) -> Union[ColumnBase, ScalarLike]: @@ -1802,9 +1808,7 @@ def as_column( data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) - if ( - nan_as_null or (mask is None and nan_as_null is None) - ) and col.dtype.kind == "f": + if nan_as_null or (mask is None and nan_as_null is None): col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index fb413959eb9..6fb4f17b76d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -536,7 +536,7 @@ def fillna( return col if method is not None: - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") @@ -545,7 +545,7 @@ def fillna( isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype ): - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if np.isscalar(fill_value): # cast safely to the same dtype as self @@ -572,7 +572,7 @@ def fillna( else: fill_value = fill_value.astype(col.dtype) - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 541c32a2520..d38ec9cf30f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ecfcec15337..d898eb4b9c3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True): results = {} for name, col in self._data.items(): if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col + result_col = col.nans_to_nulls() else: if col.has_nulls(include_nan=True): first_index = col.isnull().find_first_value(True) @@ -1915,12 +1912,12 @@ def nans_to_nulls(self): 1 3.14 2 """ - result = ( - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col.copy() - for col in self._data.columns - ) + result = [] + for col in self._data.columns: + converted = col.nans_to_nulls() + if converted is col: + converted = converted.copy() + result.append(converted) return self._from_data_like_self( self._data._from_columns_like_self(result) ) @@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): thresh = len(df) for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col + check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count ) < thresh @@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): if len(subset) == 0: return self.copy(deep=True) - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d4772d5b4c2..53239cb7ea0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na): else: unique = column.unique().sort_values() if not dummy_na: - if np.issubdtype(unique.dtype, np.floating): - unique = unique.nans_to_nulls() - unique = unique.dropna() + unique = unique.nans_to_nulls().dropna() return unique diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d77ec596271..9466398964a 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex(): actual = gdf.fillna(10) assert_eq(expected, actual) + + +def test_fillna_nan_and_null(): + ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) + result = ser.fillna(2.2) + expected = cudf.Series([2.2, 2.2, 1.1]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 323716d5fc3..f47c42d9a1d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy(): ser1 = cudf.Series(range(3), index=list("abc")) ser2 = cudf.Series(ser1) assert ser1.index is ser2.index + + +@pytest.mark.parametrize("value", [1, 1.1]) +def test_nans_to_nulls_noop_copies_column(value): + ser1 = cudf.Series([value]) + ser2 = ser1.nans_to_nulls() + assert ser1._column is not ser2._column