From 6a23775db29dc4b38820994297c94201c9287aaf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:28:34 -1000 Subject: [PATCH] Ensure column.fillna signatures are consistent (#14724) Aligns the definitions of `Columns.fillna` among all subclasses. `dtype` looks to only needed in certain instances to cast the fill value so can do that separately. A `fill_nan` can be avoided with its single usage in a `can_cast` routine by checking for nan first Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14724 --- python/cudf/cudf/core/column/categorical.py | 9 ++++---- python/cudf/cudf/core/column/column.py | 9 ++++---- python/cudf/cudf/core/column/datetime.py | 5 +++-- python/cudf/cudf/core/column/decimal.py | 22 +++++++++---------- python/cudf/cudf/core/column/numerical.py | 19 ++++++++-------- python/cudf/cudf/core/column/string.py | 12 +++++------ python/cudf/cudf/core/column/timedelta.py | 24 ++++++++++----------- python/cudf/cudf/tests/test_timedelta.py | 3 +-- 8 files changed, 50 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..c7e7cf2bf7e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Any = None, - dtype: Optional[Dtype] = None, - ) -> CategoricalColumn: + method: Optional[str] = None, + ) -> Self: """ Fill null values with *fill_value* """ @@ -1276,7 +1275,7 @@ def fillna( self.codes.dtype ) - return super().fillna(value=fill_value, method=method) + return super().fillna(fill_value, method=method) def indices_of( self, value: ScalarLike diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..440ac855691 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -710,16 +710,15 @@ def _check_scatter_key_length( def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=value, method=method, dtype=dtype + input_col=self, replacement=fill_value, method=method )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. - result = result.fillna(rhs.null_count > 0, dtype=bool) + result = result.fillna(cudf.Scalar(rhs.null_count > 0)) return result def as_mask(self) -> Buffer: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 466ea3220c8..5aa75365389 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -598,12 +599,12 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> DatetimeColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) if is_scalar(fill_value): + # TODO: Add cast checking like TimedeltaColumn.fillna if not isinstance(fill_value, cudf.Scalar): fill_value = cudf.Scalar(fill_value, dtype=self.dtype) else: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 02e03f92745..299875f0091 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import warnings from decimal import Decimal @@ -7,6 +7,7 @@ import cupy as cp import numpy as np import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ): + ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ - if isinstance(value, (int, Decimal)): - value = cudf.Scalar(value, dtype=self.dtype) + if isinstance(fill_value, (int, Decimal)): + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) elif ( - isinstance(value, DecimalBaseColumn) - or isinstance(value, cudf.core.column.NumericalColumn) - and is_integer_dtype(value.dtype) + isinstance(fill_value, DecimalBaseColumn) + or isinstance(fill_value, cudf.core.column.NumericalColumn) + and is_integer_dtype(fill_value.dtype) ): - value = value.astype(self.dtype) + fill_value = fill_value.astype(self.dtype) else: raise TypeError( "Decimal columns only support using fillna with decimal and " "integer values" ) - return super().fillna(value=value, method=method) + return super().fillna(fill_value, method=method) def normalize_binop_value(self, other): if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f40886bf153..e848c86897f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,6 +16,7 @@ import cupy as cp import numpy as np import pandas as pd +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: + def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. if self.dtype.kind != "f" or self.nan_count == 0: return self @@ -533,13 +534,11 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - fill_nan: bool = True, - ) -> NumericalColumn: + ) -> Self: """ Fill null values with *fill_value* """ - col = self.nans_to_nulls() if fill_nan else self + col = self.nans_to_nulls() if col.null_count == 0: return col @@ -574,8 +573,8 @@ def fillna( if not (new_fill_value == fill_value).all(): raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to " - f"{cudf.dtype(dtype).type.__name__}" + f"{fill_value.dtype.type.__name__} to " + f"{col.dtype.type.__name__}" ) fill_value = new_fill_value else: @@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # want to cast float to int: elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: + if self.nan_count > 0: + return False iinfo = np.iinfo(to_dtype) min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): - filled = self.fillna(0, fill_nan=False) + filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7bf81f3e2d3..06b5ac31ca6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -21,6 +21,7 @@ import pandas as pd import pyarrow as pa from numba import cuda +from typing_extensions import Self import cudf import cudf.api.types @@ -5824,17 +5825,16 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> StringColumn: + ) -> Self: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) elif cudf._lib.scalar._is_null_host_scalar(fill_value): # Trying to fill with value? Return copy. return self.copy(deep=True) - return super().fillna(value=fill_value, dtype="object") - else: - return super().fillna(method=method) + else: + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + return super().fillna(fill_value, method=method) def normalize_binop_value( self, other diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 572b3b894dc..d664b0f18df 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -281,24 +282,21 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> TimeDeltaColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) - col: ColumnBase = self if is_scalar(fill_value): - if isinstance(fill_value, np.timedelta64): - dtype = determine_out_dtype(self.dtype, fill_value.dtype) - fill_value = fill_value.astype(dtype) - col = col.astype(dtype) - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=dtype) + fill_value = cudf.Scalar(fill_value) + dtype = determine_out_dtype(self.dtype, fill_value.dtype) + fill_value = fill_value.astype(dtype) + if self.dtype != dtype: + return cast( + Self, self.astype(dtype).fillna(fill_value, method) + ) else: fill_value = column.as_column(fill_value, nan_as_null=False) - return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) - else: - return super().fillna(method=method) + return super().fillna(fill_value, method) def as_numerical_column( self, dtype: Dtype, **kwargs diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 139ce1c4ca3..d86612d3143 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import datetime import operator @@ -1024,7 +1024,6 @@ def local_assert(expected, actual): [ np.timedelta64(4, "s"), np.timedelta64(456, "D"), - np.timedelta64(46, "h"), np.timedelta64("nat"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"),