Skip to content

Commit

Permalink
Ensure column.fillna signatures are consistent (#14724)
Browse files Browse the repository at this point in the history
Aligns the definitions of `Columns.fillna` among all subclasses. `dtype` looks to only needed in certain instances to cast the fill value so can do that separately. A `fill_nan` can be avoided with its single usage in a `can_cast` routine by checking for nan first

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #14724
  • Loading branch information
mroeschke authored Jan 10, 2024
1 parent 8df33ee commit 6a23775
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 53 deletions.
9 changes: 4 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase:
def fillna(
self,
fill_value: Any = None,
method: Any = None,
dtype: Optional[Dtype] = None,
) -> CategoricalColumn:
method: Optional[str] = None,
) -> Self:
"""
Fill null values with *fill_value*
"""
Expand Down Expand Up @@ -1276,7 +1275,7 @@ def fillna(
self.codes.dtype
)

return super().fillna(value=fill_value, method=method)
return super().fillna(fill_value, method=method)

def indices_of(
self, value: ScalarLike
Expand Down
9 changes: 4 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -710,16 +710,15 @@ def _check_scatter_key_length(

def fillna(
self,
value: Any = None,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> Self:
"""Fill null values with ``value``.
Returns a copy with null filled.
"""
return libcudf.replace.replace_nulls(
input_col=self, replacement=value, method=method, dtype=dtype
input_col=self, replacement=fill_value, method=method
)._with_type_metadata(self.dtype)

def isnull(self) -> ColumnBase:
Expand Down Expand Up @@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
# https://github.com/rapidsai/cudf/issues/14515 by
# providing a mode in which cudf::contains does not mask
# the result.
result = result.fillna(rhs.null_count > 0, dtype=bool)
result = result.fillna(cudf.Scalar(rhs.null_count > 0))
return result

def as_mask(self) -> Buffer:
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -598,12 +599,12 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> DatetimeColumn:
) -> Self:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return self.copy(deep=True)
if is_scalar(fill_value):
# TODO: Add cast checking like TimedeltaColumn.fillna
if not isinstance(fill_value, cudf.Scalar):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
else:
Expand Down
22 changes: 11 additions & 11 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

import warnings
from decimal import Decimal
Expand All @@ -7,6 +7,7 @@
import cupy as cp
import numpy as np
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):

def fillna(
self,
value: Any = None,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
):
) -> Self:
"""Fill null values with ``value``.
Returns a copy with null filled.
"""
if isinstance(value, (int, Decimal)):
value = cudf.Scalar(value, dtype=self.dtype)
if isinstance(fill_value, (int, Decimal)):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
elif (
isinstance(value, DecimalBaseColumn)
or isinstance(value, cudf.core.column.NumericalColumn)
and is_integer_dtype(value.dtype)
isinstance(fill_value, DecimalBaseColumn)
or isinstance(fill_value, cudf.core.column.NumericalColumn)
and is_integer_dtype(fill_value.dtype)
):
value = value.astype(self.dtype)
fill_value = fill_value.astype(self.dtype)
else:
raise TypeError(
"Decimal columns only support using fillna with decimal and "
"integer values"
)

return super().fillna(value=value, method=method)
return super().fillna(fill_value, method=method)

def normalize_binop_value(self, other):
if isinstance(other, ColumnBase):
Expand Down
19 changes: 10 additions & 9 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -16,6 +16,7 @@
import cupy as cp
import numpy as np
import pandas as pd
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:

return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)

def nans_to_nulls(self: NumericalColumn) -> NumericalColumn:
def nans_to_nulls(self: Self) -> Self:
# Only floats can contain nan.
if self.dtype.kind != "f" or self.nan_count == 0:
return self
Expand Down Expand Up @@ -533,13 +534,11 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
fill_nan: bool = True,
) -> NumericalColumn:
) -> Self:
"""
Fill null values with *fill_value*
"""
col = self.nans_to_nulls() if fill_nan else self
col = self.nans_to_nulls()

if col.null_count == 0:
return col
Expand Down Expand Up @@ -574,8 +573,8 @@ def fillna(
if not (new_fill_value == fill_value).all():
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{col.dtype.type.__name__} to "
f"{cudf.dtype(dtype).type.__name__}"
f"{fill_value.dtype.type.__name__} to "
f"{col.dtype.type.__name__}"
)
fill_value = new_fill_value
else:
Expand Down Expand Up @@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

# want to cast float to int:
elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}:
if self.nan_count > 0:
return False
iinfo = np.iinfo(to_dtype)
min_, max_ = iinfo.min, iinfo.max

# best we can do is hope to catch it here and avoid compare
if (self.min() >= min_) and (self.max() <= max_):
filled = self.fillna(0, fill_nan=False)
filled = self.fillna(0)
return (cudf.Series(filled) % 1 == 0).all()
else:
return False
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -21,6 +21,7 @@
import pandas as pd
import pyarrow as pa
from numba import cuda
from typing_extensions import Self

import cudf
import cudf.api.types
Expand Down Expand Up @@ -5824,17 +5825,16 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> StringColumn:
) -> Self:
if fill_value is not None:
if not is_scalar(fill_value):
fill_value = column.as_column(fill_value, dtype=self.dtype)
elif cudf._lib.scalar._is_null_host_scalar(fill_value):
# Trying to fill <NA> with <NA> value? Return copy.
return self.copy(deep=True)
return super().fillna(value=fill_value, dtype="object")
else:
return super().fillna(method=method)
else:
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
return super().fillna(fill_value, method=method)

def normalize_binop_value(
self, other
Expand Down
24 changes: 11 additions & 13 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -281,24 +282,21 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> TimeDeltaColumn:
) -> Self:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return self.copy(deep=True)
col: ColumnBase = self
if is_scalar(fill_value):
if isinstance(fill_value, np.timedelta64):
dtype = determine_out_dtype(self.dtype, fill_value.dtype)
fill_value = fill_value.astype(dtype)
col = col.astype(dtype)
if not isinstance(fill_value, cudf.Scalar):
fill_value = cudf.Scalar(fill_value, dtype=dtype)
fill_value = cudf.Scalar(fill_value)
dtype = determine_out_dtype(self.dtype, fill_value.dtype)
fill_value = fill_value.astype(dtype)
if self.dtype != dtype:
return cast(
Self, self.astype(dtype).fillna(fill_value, method)
)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value))
else:
return super().fillna(method=method)
return super().fillna(fill_value, method)

def as_numerical_column(
self, dtype: Dtype, **kwargs
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/tests/test_timedelta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import datetime
import operator
Expand Down Expand Up @@ -1024,7 +1024,6 @@ def local_assert(expected, actual):
[
np.timedelta64(4, "s"),
np.timedelta64(456, "D"),
np.timedelta64(46, "h"),
np.timedelta64("nat"),
np.timedelta64(1, "s"),
np.timedelta64(1, "ms"),
Expand Down

0 comments on commit 6a23775

Please sign in to comment.