From bc08662fd6c08635af78faaf4bc8a909f85a3f8a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jun 2024 07:37:19 -1000 Subject: [PATCH] Refactor fillna logic to push specifics toward Frame subclasses and Column subclasses (#15957) Essentially 2 reorganizations 1. `Frame.fillna` input argument logic was pushed toward its subclasses `Series`/`DataFrame`/`IndexedFrame` where appripriate 2. `Column.fillna` was made generic. Column subclasses now implement `_validate_fillna_value` used by `Column.fillna` to validate the fill value Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15957 --- python/cudf/cudf/core/column/categorical.py | 79 ++++++++------------ python/cudf/cudf/core/column/column.py | 21 +++++- python/cudf/cudf/core/column/datetime.py | 21 +----- python/cudf/cudf/core/column/decimal.py | 39 ++++------ python/cudf/cudf/core/column/numerical.py | 63 ++++------------ python/cudf/cudf/core/column/string.py | 18 +---- python/cudf/cudf/core/column/timedelta.py | 19 +---- python/cudf/cudf/core/dataframe.py | 26 +++++++ python/cudf/cudf/core/frame.py | 81 ++++++++------------- python/cudf/cudf/core/indexed_frame.py | 23 ------ python/cudf/cudf/core/series.py | 14 +--- python/cudf/cudf/tests/test_series.py | 12 +++ 12 files changed, 155 insertions(+), 261 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f538180805b..231af30c06d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase: return result - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - """ - Fill null values with *fill_value* - """ - if fill_value is not None: - fill_is_scalar = np.isscalar(fill_value) - - if fill_is_scalar: - if fill_value == _DEFAULT_CATEGORICAL_VALUE: - fill_value = self.codes.dtype.type(fill_value) - else: - try: - fill_value = self._encode(fill_value) - fill_value = self.codes.dtype.type(fill_value) - except ValueError as err: - err_msg = "fill value must be in categories" - raise ValueError(err_msg) from err + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> cudf.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if cudf.api.types.is_scalar(fill_value): + if fill_value != _DEFAULT_CATEGORICAL_VALUE: + try: + fill_value = self._encode(fill_value) + except ValueError as err: + raise ValueError( + f"{fill_value=} must be in categories" + ) from err + return cudf.Scalar(fill_value, dtype=self.codes.dtype) + else: + fill_value = column.as_column(fill_value, nan_as_null=False) + if isinstance(fill_value.dtype, CategoricalDtype): + if self.dtype != fill_value.dtype: + raise TypeError( + "Cannot set a categorical with another without identical categories" + ) else: - fill_value = column.as_column(fill_value, nan_as_null=False) - if isinstance(fill_value, CategoricalColumn): - if self.dtype != fill_value.dtype: - raise TypeError( - "Cannot set a Categorical with another, " - "without identical categories" - ) - # TODO: only required if fill_value has a subset of the - # categories: - fill_value = fill_value._set_categories( - self.categories, - is_unique=True, - ) - fill_value = column.as_column(fill_value.codes).astype( - self.codes.dtype + raise TypeError( + "Cannot set a categorical with non-categorical data" ) - - # Validation of `fill_value` will have to be performed - # before returning self. - if not self.nullable: - return self - - return super().fillna(fill_value, method=method) + fill_value = fill_value._set_categories( + self.categories, + ) + return fill_value.codes.astype(self.codes.dtype) def indices_of( self, value: ScalarLike @@ -1372,11 +1355,13 @@ def _set_categories( if not (is_unique or new_cats.is_unique): new_cats = cudf.Series(new_cats)._column.unique() + if cur_cats.equals(new_cats, check_dtypes=True): + # TODO: Internal usages don't always need a copy; add a copy keyword + # as_ordered shallow copies + return self.copy().as_ordered(ordered=ordered) + cur_codes = self.codes - max_cat_size = ( - len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) - ) - out_code_dtype = min_unsigned_type(max_cat_size) + out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats))) cur_order = column.as_column(range(len(cur_codes))) old_codes = column.as_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 586689e2ee3..dfcdfbb9d91 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -666,15 +666,32 @@ def _check_scatter_key_length( f"{num_keys}" ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> cudf.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if is_scalar(fill_value): + return cudf.Scalar(fill_value, dtype=self.dtype) + return as_column(fill_value) + def fillna( self, - fill_value: Any = None, - method: str | None = None, + fill_value: ScalarLike | ColumnLike, + method: Literal["ffill", "bfill", None] = None, ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ + if not self.has_nulls(include_nan=True): + return self.copy() + elif method is None: + if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( + fill_value + ): + return self.copy() + else: + fill_value = self._validate_fillna_value(fill_value) return libcudf.replace.replace_nulls( input_col=self.nans_to_nulls(), replacement=fill_value, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d88553361dd..121076b69ce 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,18 +8,17 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Any, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, Sequence, cast import numpy as np import pandas as pd import pyarrow as pa -from typing_extensions import Self import cudf from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -641,22 +640,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: else: return result_col - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - if fill_value is not None: - if cudf.utils.utils._isnat(fill_value): - return self.copy(deep=True) - if is_scalar(fill_value): - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=self.dtype) - else: - fill_value = column.as_column(fill_value, nan_as_null=False) - - return super().fillna(fill_value, method) - def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index e9d9b4933e5..d66908b5f94 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,12 +4,11 @@ import warnings from decimal import Decimal -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Sequence, cast import cupy as cp import numpy as np import pyarrow as pa -from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -31,7 +30,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from cudf._typing import ColumnBinaryOperand, Dtype + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike class DecimalBaseColumn(NumericalBaseColumn): @@ -135,30 +134,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): return result - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - """Fill null values with ``value``. - - Returns a copy with null filled. - """ + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> cudf.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" if isinstance(fill_value, (int, Decimal)): - fill_value = cudf.Scalar(fill_value, dtype=self.dtype) - elif ( - isinstance(fill_value, DecimalBaseColumn) - or isinstance(fill_value, cudf.core.column.NumericalColumn) - and is_integer_dtype(fill_value.dtype) + return cudf.Scalar(fill_value, dtype=self.dtype) + elif isinstance(fill_value, ColumnBase) and ( + isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu" ): - fill_value = fill_value.astype(self.dtype) - else: - raise TypeError( - "Decimal columns only support using fillna with decimal and " - "integer values" - ) - - return super().fillna(fill_value, method=method) + return fill_value.astype(self.dtype) + raise TypeError( + "Decimal columns only support using fillna with decimal and " + "integer values" + ) def normalize_binop_value(self, other): if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 098cf43421b..76c64e1aea0 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -532,57 +532,26 @@ def find_and_replace( replaced, df._data["old"], df._data["new"] ) - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - """ - Fill null values with *fill_value* - """ - col = self.nans_to_nulls() - - if col.null_count == 0: - return col - - if method is not None: - return super().fillna(fill_value, method) - - if fill_value is None: - raise ValueError("Must specify either 'fill_value' or 'method'") - - if ( - isinstance(fill_value, cudf.Scalar) - and fill_value.dtype == col.dtype - ): - return super().fillna(fill_value, method) - - if np.isscalar(fill_value): - # cast safely to the same dtype as self - fill_value_casted = col.dtype.type(fill_value) - if not np.isnan(fill_value) and (fill_value_casted != fill_value): + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> cudf.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if is_scalar(fill_value): + cudf_obj = cudf.Scalar(fill_value) + if not as_column(cudf_obj).can_cast_safely(self.dtype): raise TypeError( f"Cannot safely cast non-equivalent " - f"{type(fill_value).__name__} to {col.dtype.name}" + f"{type(fill_value).__name__} to {self.dtype.name}" ) - fill_value = cudf.Scalar(fill_value_casted) else: - fill_value = column.as_column(fill_value, nan_as_null=False) - if is_integer_dtype(col.dtype): - # cast safely to the same dtype as self - if fill_value.dtype != col.dtype: - new_fill_value = fill_value.astype(col.dtype) - if not (new_fill_value == fill_value).all(): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{fill_value.dtype.type.__name__} to " - f"{col.dtype.type.__name__}" - ) - fill_value = new_fill_value - else: - fill_value = fill_value.astype(col.dtype) - - return super().fillna(fill_value, method) + cudf_obj = as_column(fill_value, nan_as_null=False) + if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined] + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{cudf_obj.dtype.type.__name__} to " + f"{self.dtype.type.__name__}" + ) + return cudf_obj.astype(self.dtype) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2451a9cc0af..936cd1eccb0 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,12 +5,11 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Sequence, cast, overload +from typing import TYPE_CHECKING, Sequence, cast, overload import numpy as np import pandas as pd import pyarrow as pa -from typing_extensions import Self import cudf import cudf.api.types @@ -5838,21 +5837,6 @@ def find_and_replace( res = self return libcudf.replace.replace(res, df._data["old"], df._data["new"]) - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - if fill_value is not None: - if not is_scalar(fill_value): - fill_value = column.as_column(fill_value, dtype=self.dtype) - elif cudf._lib.scalar._is_null_host_scalar(fill_value): - # Trying to fill with value? Return copy. - return self.copy(deep=True) - else: - fill_value = cudf.Scalar(fill_value, dtype=self.dtype) - return super().fillna(fill_value, method=method) - def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 26b449f1863..8f41bcb6422 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,12 +4,11 @@ import datetime import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Sequence, cast import numpy as np import pandas as pd import pyarrow as pa -from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -252,22 +251,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] - def fillna( - self, - fill_value: Any = None, - method: str | None = None, - ) -> Self: - if fill_value is not None: - if cudf.utils.utils._isnat(fill_value): - return self.copy(deep=True) - if is_scalar(fill_value): - fill_value = cudf.Scalar(fill_value) - dtype = self.dtype - fill_value = fill_value.astype(dtype) - else: - fill_value = column.as_column(fill_value, nan_as_null=False) - return super().fillna(fill_value, method) - def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 76bb9d2a8ed..f0d8157011d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2980,6 +2980,32 @@ def set_index( df.index = idx return df if not inplace else None + @_cudf_nvtx_annotate + def fillna( + self, value=None, method=None, axis=None, inplace=False, limit=None + ): # noqa: D102 + if isinstance(value, (pd.Series, pd.DataFrame)): + value = cudf.from_pandas(value) + if isinstance(value, cudf.Series): + # Align value.index to self.columns + value = value.reindex(self._column_names) + elif isinstance(value, cudf.DataFrame): + if not self.index.equals(value.index): + # Align value.index to self.index + value = value.reindex(self.index) + value = dict(value.items()) + elif isinstance(value, abc.Mapping): + # Align value.indexes to self.index + value = { + key: value.reindex(self.index) + if isinstance(value, cudf.Series) + else value + for key, value in value.items() + } + return super().fillna( + value=value, method=method, axis=axis, inplace=inplace, limit=limit + ) + @_cudf_nvtx_annotate def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 38bff3946d6..8ca71180c00 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,7 +2,6 @@ from __future__ import annotations -import copy import operator import pickle import warnings @@ -20,6 +19,7 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar +from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -38,7 +38,7 @@ if TYPE_CHECKING: from types import ModuleType - from cudf._typing import Dtype + from cudf._typing import Dtype, ScalarLike # TODO: It looks like Frame is missing a declaration of `copy`, need to add @@ -613,8 +613,8 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: @_cudf_nvtx_annotate def fillna( self, - value=None, - method: Literal["ffill", "bfill", "pad", "backfill"] | None = None, + value: None | ScalarLike | cudf.Series = None, + method: Literal["ffill", "bfill", "pad", "backfill", None] = None, axis=None, inplace: bool = False, limit=None, @@ -725,6 +725,16 @@ def fillna( raise ValueError("Cannot specify both 'value' and 'method'.") if method: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is " + "deprecated and will raise in a future version. " + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) if method not in {"ffill", "bfill", "pad", "backfill"}: raise NotImplementedError( f"Fill method {method} is not supported" @@ -734,57 +744,24 @@ def fillna( elif method == "backfill": method = "bfill" - # TODO: This logic should be handled in different subclasses since - # different Frames support different types of values. - if isinstance(value, cudf.Series): - value = value.reindex(self._data.names) - elif isinstance(value, cudf.DataFrame): - if not self.index.equals(value.index): # type: ignore[attr-defined] - value = value.reindex(self.index) # type: ignore[attr-defined] - else: - value = value - elif not isinstance(value, abc.Mapping): - value = {name: copy.deepcopy(value) for name in self._data.names} - else: - value = { - key: value.reindex(self.index) # type: ignore[attr-defined] - if isinstance(value, cudf.Series) - else value - for key, value in value.items() - } - - filled_data = {} - for col_name, col in self._data.items(): - if col_name in value and method is None: - replace_val = value[col_name] - else: - replace_val = None - should_fill = ( - ( - col_name in value - and col.has_nulls(include_nan=True) - and not libcudf.scalar._is_null_host_scalar(replace_val) - ) - or method is not None - or ( - isinstance(col, cudf.core.column.CategoricalColumn) - and not libcudf.scalar._is_null_host_scalar(replace_val) - ) + if is_scalar(value): + value = {name: value for name in self._column_names} + elif not isinstance(value, (abc.Mapping, cudf.Series)): + raise TypeError( + f'"value" parameter must be a scalar, dict ' + f"or Series, but you passed a " + f'"{type(value).__name__}"' ) - if should_fill: - filled_data[col_name] = col.fillna(replace_val, method) - else: - filled_data[col_name] = col.copy(deep=True) + + filled_columns = [ + col.fillna(value[name], method) if name in value else col.copy() + for name, col in self._data.items() + ] return self._mimic_inplace( - self._from_data( - data=ColumnAccessor( - data=filled_data, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - rangeindex=self._data.rangeindex, - label_dtype=self._data.label_dtype, - verify=False, + self._from_data_like_self( + self._data._from_columns_like_self( + filled_columns, verify=False ) ), inplace=inplace, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5cae4a857ee..280a6e92eab 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3217,29 +3217,6 @@ def _split(self, splits, keep_index=True): for i in range(len(splits) + 1) ] - @_cudf_nvtx_annotate - def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): # noqa: D102 - if method is not None: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is " - "deprecated and will raise in a future version. " - "Use obj.ffill() or obj.bfill() instead.", - FutureWarning, - ) - old_index = self.index - ret = super().fillna(value, method, axis, inplace, limit) - if inplace: - self.index = old_index - else: - ret.index = old_index - return ret - @_cudf_nvtx_annotate def bfill(self, value=None, axis=None, inplace=None, limit=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c0716d7709a..15ad0813601 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1797,20 +1797,12 @@ def fillna( ): if isinstance(value, pd.Series): value = Series.from_pandas(value) - - if not (is_scalar(value) or isinstance(value, (abc.Mapping, Series))): - raise TypeError( - f'"value" parameter must be a scalar, dict ' - f"or Series, but you passed a " - f'"{type(value).__name__}"' - ) - - if isinstance(value, (abc.Mapping, Series)): + elif isinstance(value, abc.Mapping): value = Series(value) + if isinstance(value, cudf.Series): if not self.index.equals(value.index): value = value.reindex(self.index) - value = value._column - + value = {self.name: value._column} return super().fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 87ec365868b..467d0c46ae7 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1054,6 +1054,18 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): assert_eq(expected, actual) +def test_fillna_categorical_with_non_categorical_raises(): + ser = cudf.Series([1, None], dtype="category") + with pytest.raises(TypeError): + ser.fillna(cudf.Series([1, 2])) + + +def test_fillna_categorical_with_different_categories_raises(): + ser = cudf.Series([1, None], dtype="category") + with pytest.raises(TypeError): + ser.fillna(cudf.Series([1, 2]), dtype="category") + + def test_series_mask_mixed_dtypes_error(): s = cudf.Series(["a", "b", "c"]) with pytest.raises(