From bbe15b69a529d26dbf4d9a0cd7aeb58d427d31e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 15:21:00 -0800 Subject: [PATCH 1/3] Ensure column.fillna signatures are consistent --- python/cudf/cudf/core/column/categorical.py | 9 ++++----- python/cudf/cudf/core/column/column.py | 9 ++++----- python/cudf/cudf/core/column/datetime.py | 4 ++-- python/cudf/cudf/core/column/decimal.py | 22 ++++++++++----------- python/cudf/cudf/core/column/numerical.py | 19 +++++++++--------- python/cudf/cudf/core/column/string.py | 12 ++++++----- python/cudf/cudf/core/column/timedelta.py | 10 +++++----- 7 files changed, 43 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..c7e7cf2bf7e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Any = None, - dtype: Optional[Dtype] = None, - ) -> CategoricalColumn: + method: Optional[str] = None, + ) -> Self: """ Fill null values with *fill_value* """ @@ -1276,7 +1275,7 @@ def fillna( self.codes.dtype ) - return super().fillna(value=fill_value, method=method) + return super().fillna(fill_value, method=method) def indices_of( self, value: ScalarLike diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..440ac855691 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -710,16 +710,15 @@ def _check_scatter_key_length( def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=value, method=method, dtype=dtype + input_col=self, replacement=fill_value, method=method )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. - result = result.fillna(rhs.null_count > 0, dtype=bool) + result = result.fillna(cudf.Scalar(rhs.null_count > 0)) return result def as_mask(self) -> Buffer: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 466ea3220c8..71f190ef60a 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -598,8 +599,7 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> DatetimeColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 02e03f92745..299875f0091 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import warnings from decimal import Decimal @@ -7,6 +7,7 @@ import cupy as cp import numpy as np import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ): + ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ - if isinstance(value, (int, Decimal)): - value = cudf.Scalar(value, dtype=self.dtype) + if isinstance(fill_value, (int, Decimal)): + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) elif ( - isinstance(value, DecimalBaseColumn) - or isinstance(value, cudf.core.column.NumericalColumn) - and is_integer_dtype(value.dtype) + isinstance(fill_value, DecimalBaseColumn) + or isinstance(fill_value, cudf.core.column.NumericalColumn) + and is_integer_dtype(fill_value.dtype) ): - value = value.astype(self.dtype) + fill_value = fill_value.astype(self.dtype) else: raise TypeError( "Decimal columns only support using fillna with decimal and " "integer values" ) - return super().fillna(value=value, method=method) + return super().fillna(fill_value, method=method) def normalize_binop_value(self, other): if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f40886bf153..e848c86897f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,6 +16,7 @@ import cupy as cp import numpy as np import pandas as pd +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: + def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. if self.dtype.kind != "f" or self.nan_count == 0: return self @@ -533,13 +534,11 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - fill_nan: bool = True, - ) -> NumericalColumn: + ) -> Self: """ Fill null values with *fill_value* """ - col = self.nans_to_nulls() if fill_nan else self + col = self.nans_to_nulls() if col.null_count == 0: return col @@ -574,8 +573,8 @@ def fillna( if not (new_fill_value == fill_value).all(): raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to " - f"{cudf.dtype(dtype).type.__name__}" + f"{fill_value.dtype.type.__name__} to " + f"{col.dtype.type.__name__}" ) fill_value = new_fill_value else: @@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # want to cast float to int: elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: + if self.nan_count > 0: + return False iinfo = np.iinfo(to_dtype) min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): - filled = self.fillna(0, fill_nan=False) + filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7bf81f3e2d3..1f7e2fe846d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -21,6 +21,7 @@ import pandas as pd import pyarrow as pa from numba import cuda +from typing_extensions import Self import cudf import cudf.api.types @@ -5824,15 +5825,16 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> StringColumn: + ) -> Self: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) - elif cudf._lib.scalar._is_null_host_scalar(fill_value): + else: + fill_value = cudf.Scalar(fill_value, dtype="object") + if cudf._lib.scalar._is_null_host_scalar(fill_value): # Trying to fill with value? Return copy. return self.copy(deep=True) - return super().fillna(value=fill_value, dtype="object") + return super().fillna(fill_value) else: return super().fillna(method=method) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 572b3b894dc..2441fc9888f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -281,8 +282,7 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> TimeDeltaColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) @@ -293,10 +293,10 @@ def fillna( fill_value = fill_value.astype(dtype) col = col.astype(dtype) if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=dtype) + fill_value = cudf.Scalar(fill_value) else: fill_value = column.as_column(fill_value, nan_as_null=False) - return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) + return super().fillna(fill_value) else: return super().fillna(method=method) From b3e09b691d7a7427bce722b7ead499bf60fd61ab Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:05:54 -0800 Subject: [PATCH 2/3] fix timedelta --- python/cudf/cudf/core/column/datetime.py | 1 + python/cudf/cudf/core/column/timedelta.py | 18 ++++++++---------- python/cudf/cudf/tests/test_timedelta.py | 3 +-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 71f190ef60a..5aa75365389 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -604,6 +604,7 @@ def fillna( if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) if is_scalar(fill_value): + # TODO: Add cast checking like TimedeltaColumn.fillna if not isinstance(fill_value, cudf.Scalar): fill_value = cudf.Scalar(fill_value, dtype=self.dtype) else: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2441fc9888f..d664b0f18df 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -286,19 +286,17 @@ def fillna( if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) - col: ColumnBase = self if is_scalar(fill_value): - if isinstance(fill_value, np.timedelta64): - dtype = determine_out_dtype(self.dtype, fill_value.dtype) - fill_value = fill_value.astype(dtype) - col = col.astype(dtype) - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value) + fill_value = cudf.Scalar(fill_value) + dtype = determine_out_dtype(self.dtype, fill_value.dtype) + fill_value = fill_value.astype(dtype) + if self.dtype != dtype: + return cast( + Self, self.astype(dtype).fillna(fill_value, method) + ) else: fill_value = column.as_column(fill_value, nan_as_null=False) - return super().fillna(fill_value) - else: - return super().fillna(method=method) + return super().fillna(fill_value, method) def as_numerical_column( self, dtype: Dtype, **kwargs diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 139ce1c4ca3..d86612d3143 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import datetime import operator @@ -1024,7 +1024,6 @@ def local_assert(expected, actual): [ np.timedelta64(4, "s"), np.timedelta64(456, "D"), - np.timedelta64(46, "h"), np.timedelta64("nat"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), From 28c93dc84364dbc9b871a7d992bdc9302d8c6e47 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:24:05 -0800 Subject: [PATCH 3/3] Fix string fillna --- python/cudf/cudf/core/column/string.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1f7e2fe846d..06b5ac31ca6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5829,14 +5829,12 @@ def fillna( if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) - else: - fill_value = cudf.Scalar(fill_value, dtype="object") - if cudf._lib.scalar._is_null_host_scalar(fill_value): + elif cudf._lib.scalar._is_null_host_scalar(fill_value): # Trying to fill with value? Return copy. return self.copy(deep=True) - return super().fillna(fill_value) - else: - return super().fillna(method=method) + else: + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + return super().fillna(fill_value, method=method) def normalize_binop_value( self, other