From 9d4b08c17041cebcb121e297a3d334c290790e9c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:58:59 -0700 Subject: [PATCH 1/3] Define Column.nan_as_null to return self --- python/cudf/cudf/core/_base_index.py | 7 +---- python/cudf/cudf/core/column/categorical.py | 6 ++-- python/cudf/cudf/core/column/column.py | 14 +++++---- python/cudf/cudf/core/column/numerical.py | 6 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 29 ++++++------------- python/cudf/cudf/core/reshape.py | 4 +-- 7 files changed, 27 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e6868ae3431..ad6e9e73da3 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2072,12 +2072,7 @@ def dropna(self, how="any"): pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( drop_nulls( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0ff8209dcd4..08bf97bf99d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -817,10 +817,8 @@ def to_pandas( .values_host ) - cats = col.categories - if cats.dtype.kind in "biuf": - cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] - elif not isinstance(cats.dtype, IntervalDtype): + cats = col.categories.nans_to_nulls() + if not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 59bae179497..972c904400d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -288,7 +288,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self) -> ColumnBase: + def dropna(self) -> Self: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -702,7 +702,9 @@ def fillna( Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=fill_value, method=method + input_col=self.nans_to_nulls(), + replacement=fill_value, + method=method, )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -1247,6 +1249,10 @@ def unary_operator(self, unaryop: str): f"Operation {unaryop} not supported for dtype {self.dtype}." ) + def nans_to_nulls(self: Self) -> Self: + """Convert NaN to NA.""" + return self + def normalize_binop_value( self, other: ScalarLike ) -> Union[ColumnBase, ScalarLike]: @@ -1809,9 +1815,7 @@ def as_column( data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) - if ( - nan_as_null or (mask is None and nan_as_null is None) - ) and col.dtype.kind == "f": + if nan_as_null or (mask is None and nan_as_null is None): col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bab862f775f..2440f23275d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -536,7 +536,7 @@ def fillna( return col if method is not None: - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") @@ -545,7 +545,7 @@ def fillna( isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype ): - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if np.isscalar(fill_value): # cast safely to the same dtype as self @@ -572,7 +572,7 @@ def fillna( else: fill_value = fill_value.astype(col.dtype) - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 541c32a2520..d38ec9cf30f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a31430e1571..4f3e1ff11d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True): results = {} for name, col in self._data.items(): if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col + result_col = col.nans_to_nulls() else: if col.has_nulls(include_nan=True): first_index = col.isnull().find_first_value(True) @@ -1917,12 +1914,12 @@ def nans_to_nulls(self): 1 3.14 2 """ - result = ( - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col.copy() - for col in self._data.columns - ) + result = [] + for col in self._data.columns: + converted = col.nans_to_nulls() + if converted is col: + converted = converted.copy() + result.append(col) return self._from_data_like_self( self._data._from_columns_like_self(result) ) @@ -4230,10 +4227,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): thresh = len(df) for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col + check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count ) < thresh @@ -4263,12 +4257,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): if len(subset) == 0: return self.copy(deep=True) - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d4772d5b4c2..53239cb7ea0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na): else: unique = column.unique().sort_values() if not dummy_na: - if np.issubdtype(unique.dtype, np.floating): - unique = unique.nans_to_nulls() - unique = unique.dropna() + unique = unique.nans_to_nulls().dropna() return unique From be66fde4162cd50307d985316f1897dc789cbd95 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:34:01 -0700 Subject: [PATCH 2/3] Add test verifying copy --- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/tests/test_series.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1e27bcda4fc..d898eb4b9c3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1917,7 +1917,7 @@ def nans_to_nulls(self): converted = col.nans_to_nulls() if converted is col: converted = converted.copy() - result.append(col) + result.append(converted) return self._from_data_like_self( self._data._from_columns_like_self(result) ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 323716d5fc3..f47c42d9a1d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy(): ser1 = cudf.Series(range(3), index=list("abc")) ser2 = cudf.Series(ser1) assert ser1.index is ser2.index + + +@pytest.mark.parametrize("value", [1, 1.1]) +def test_nans_to_nulls_noop_copies_column(value): + ser1 = cudf.Series([value]) + ser2 = ser1.nans_to_nulls() + assert ser1._column is not ser2._column From 0d74275b228bc66f34b3852f9fa5e273e27a5114 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:42:24 -0700 Subject: [PATCH 3/3] Add fillna test with nan and null --- python/cudf/cudf/tests/test_replace.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d77ec596271..9466398964a 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex(): actual = gdf.fillna(10) assert_eq(expected, actual) + + +def test_fillna_nan_and_null(): + ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) + result = ser.fillna(2.2) + expected = cudf.Series([2.2, 2.2, 1.1]) + assert_eq(result, expected)