Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure column.fillna signatures are consistent #14724

Merged
merged 4 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase:
def fillna(
self,
fill_value: Any = None,
method: Any = None,
dtype: Optional[Dtype] = None,
) -> CategoricalColumn:
method: Optional[str] = None,
Copy link
Contributor

@bdice bdice Jan 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking -- this is non-public so the API break of dropping dtype is not a "breaking change" right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been working under that assumption that cudf column objects are not public cc @shwina

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that should be fine! Just wanted to check in case this interacts with any future changes you were planning for public types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. Yeah the arguments being modified in Column.fillna do not clash with Series/DataFrame.fillna

) -> Self:
"""
Fill null values with *fill_value*
"""
Expand Down Expand Up @@ -1276,7 +1275,7 @@ def fillna(
self.codes.dtype
)

return super().fillna(value=fill_value, method=method)
return super().fillna(fill_value, method=method)

def indices_of(
self, value: ScalarLike
Expand Down
9 changes: 4 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -710,16 +710,15 @@ def _check_scatter_key_length(

def fillna(
self,
value: Any = None,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> Self:
"""Fill null values with ``value``.

Returns a copy with null filled.
"""
return libcudf.replace.replace_nulls(
input_col=self, replacement=value, method=method, dtype=dtype
input_col=self, replacement=fill_value, method=method
)._with_type_metadata(self.dtype)

def isnull(self) -> ColumnBase:
Expand Down Expand Up @@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
# https://github.com/rapidsai/cudf/issues/14515 by
# providing a mode in which cudf::contains does not mask
# the result.
result = result.fillna(rhs.null_count > 0, dtype=bool)
result = result.fillna(cudf.Scalar(rhs.null_count > 0))
return result

def as_mask(self) -> Buffer:
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -598,12 +599,12 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> DatetimeColumn:
) -> Self:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return self.copy(deep=True)
if is_scalar(fill_value):
# TODO: Add cast checking like TimedeltaColumn.fillna
if not isinstance(fill_value, cudf.Scalar):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
else:
Expand Down
22 changes: 11 additions & 11 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

import warnings
from decimal import Decimal
Expand All @@ -7,6 +7,7 @@
import cupy as cp
import numpy as np
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):

def fillna(
self,
value: Any = None,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
):
) -> Self:
"""Fill null values with ``value``.

Returns a copy with null filled.
"""
if isinstance(value, (int, Decimal)):
value = cudf.Scalar(value, dtype=self.dtype)
if isinstance(fill_value, (int, Decimal)):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
elif (
isinstance(value, DecimalBaseColumn)
or isinstance(value, cudf.core.column.NumericalColumn)
and is_integer_dtype(value.dtype)
isinstance(fill_value, DecimalBaseColumn)
or isinstance(fill_value, cudf.core.column.NumericalColumn)
and is_integer_dtype(fill_value.dtype)
):
value = value.astype(self.dtype)
fill_value = fill_value.astype(self.dtype)
else:
raise TypeError(
"Decimal columns only support using fillna with decimal and "
"integer values"
)

return super().fillna(value=value, method=method)
return super().fillna(fill_value, method=method)

def normalize_binop_value(self, other):
if isinstance(other, ColumnBase):
Expand Down
19 changes: 10 additions & 9 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -16,6 +16,7 @@
import cupy as cp
import numpy as np
import pandas as pd
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:

return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)

def nans_to_nulls(self: NumericalColumn) -> NumericalColumn:
def nans_to_nulls(self: Self) -> Self:
# Only floats can contain nan.
if self.dtype.kind != "f" or self.nan_count == 0:
return self
Expand Down Expand Up @@ -533,13 +534,11 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
fill_nan: bool = True,
) -> NumericalColumn:
) -> Self:
"""
Fill null values with *fill_value*
"""
col = self.nans_to_nulls() if fill_nan else self
col = self.nans_to_nulls()

if col.null_count == 0:
return col
Expand Down Expand Up @@ -574,8 +573,8 @@ def fillna(
if not (new_fill_value == fill_value).all():
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{col.dtype.type.__name__} to "
f"{cudf.dtype(dtype).type.__name__}"
f"{fill_value.dtype.type.__name__} to "
f"{col.dtype.type.__name__}"
)
fill_value = new_fill_value
else:
Expand Down Expand Up @@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

# want to cast float to int:
elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}:
if self.nan_count > 0:
return False
iinfo = np.iinfo(to_dtype)
min_, max_ = iinfo.min, iinfo.max

# best we can do is hope to catch it here and avoid compare
if (self.min() >= min_) and (self.max() <= max_):
filled = self.fillna(0, fill_nan=False)
filled = self.fillna(0)
return (cudf.Series(filled) % 1 == 0).all()
else:
return False
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -21,6 +21,7 @@
import pandas as pd
import pyarrow as pa
from numba import cuda
from typing_extensions import Self

import cudf
import cudf.api.types
Expand Down Expand Up @@ -5824,17 +5825,16 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> StringColumn:
) -> Self:
if fill_value is not None:
if not is_scalar(fill_value):
fill_value = column.as_column(fill_value, dtype=self.dtype)
elif cudf._lib.scalar._is_null_host_scalar(fill_value):
# Trying to fill <NA> with <NA> value? Return copy.
return self.copy(deep=True)
return super().fillna(value=fill_value, dtype="object")
else:
return super().fillna(method=method)
else:
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
return super().fillna(fill_value, method=method)

def normalize_binop_value(
self, other
Expand Down
24 changes: 11 additions & 13 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -281,24 +282,21 @@ def fillna(
self,
fill_value: Any = None,
method: Optional[str] = None,
dtype: Optional[Dtype] = None,
) -> TimeDeltaColumn:
) -> Self:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return self.copy(deep=True)
col: ColumnBase = self
if is_scalar(fill_value):
if isinstance(fill_value, np.timedelta64):
dtype = determine_out_dtype(self.dtype, fill_value.dtype)
fill_value = fill_value.astype(dtype)
col = col.astype(dtype)
if not isinstance(fill_value, cudf.Scalar):
fill_value = cudf.Scalar(fill_value, dtype=dtype)
fill_value = cudf.Scalar(fill_value)
dtype = determine_out_dtype(self.dtype, fill_value.dtype)
fill_value = fill_value.astype(dtype)
if self.dtype != dtype:
return cast(
Self, self.astype(dtype).fillna(fill_value, method)
)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value))
else:
return super().fillna(method=method)
return super().fillna(fill_value, method)

def as_numerical_column(
self, dtype: Dtype, **kwargs
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/tests/test_timedelta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import datetime
import operator
Expand Down Expand Up @@ -1024,7 +1024,6 @@ def local_assert(expected, actual):
[
np.timedelta64(4, "s"),
np.timedelta64(456, "D"),
np.timedelta64(46, "h"),
np.timedelta64("nat"),
np.timedelta64(1, "s"),
np.timedelta64(1, "ms"),
Expand Down