Skip to content

Commit

Permalink
Refactor fillna logic to push specifics toward Frame subclasses and C…
Browse files Browse the repository at this point in the history
…olumn subclasses (#15957)

Essentially 2 reorganizations

1. `Frame.fillna` input argument logic was pushed toward its subclasses `Series`/`DataFrame`/`IndexedFrame` where appropriate
2. `Column.fillna` was made generic. Column subclasses now implement `_validate_fillna_value` used by `Column.fillna` to validate the fill value

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15957
  • Loading branch information
mroeschke authored Jun 25, 2024
1 parent 1bc1f45 commit bc08662
Show file tree
Hide file tree
Showing 12 changed files with 155 additions and 261 deletions.
79 changes: 32 additions & 47 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase:

return result

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
) -> Self:
"""
Fill null values with *fill_value*
"""
if fill_value is not None:
fill_is_scalar = np.isscalar(fill_value)

if fill_is_scalar:
if fill_value == _DEFAULT_CATEGORICAL_VALUE:
fill_value = self.codes.dtype.type(fill_value)
else:
try:
fill_value = self._encode(fill_value)
fill_value = self.codes.dtype.type(fill_value)
except ValueError as err:
err_msg = "fill value must be in categories"
raise ValueError(err_msg) from err
def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if cudf.api.types.is_scalar(fill_value):
if fill_value != _DEFAULT_CATEGORICAL_VALUE:
try:
fill_value = self._encode(fill_value)
except ValueError as err:
raise ValueError(
f"{fill_value=} must be in categories"
) from err
return cudf.Scalar(fill_value, dtype=self.codes.dtype)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value.dtype, CategoricalDtype):
if self.dtype != fill_value.dtype:
raise TypeError(
"Cannot set a categorical with another without identical categories"
)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value, CategoricalColumn):
if self.dtype != fill_value.dtype:
raise TypeError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# TODO: only required if fill_value has a subset of the
# categories:
fill_value = fill_value._set_categories(
self.categories,
is_unique=True,
)
fill_value = column.as_column(fill_value.codes).astype(
self.codes.dtype
raise TypeError(
"Cannot set a categorical with non-categorical data"
)

# Validation of `fill_value` will have to be performed
# before returning self.
if not self.nullable:
return self

return super().fillna(fill_value, method=method)
fill_value = fill_value._set_categories(
self.categories,
)
return fill_value.codes.astype(self.codes.dtype)

def indices_of(
self, value: ScalarLike
Expand Down Expand Up @@ -1372,11 +1355,13 @@ def _set_categories(
if not (is_unique or new_cats.is_unique):
new_cats = cudf.Series(new_cats)._column.unique()

if cur_cats.equals(new_cats, check_dtypes=True):
# TODO: Internal usages don't always need a copy; add a copy keyword
# as_ordered shallow copies
return self.copy().as_ordered(ordered=ordered)

cur_codes = self.codes
max_cat_size = (
len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
)
out_code_dtype = min_unsigned_type(max_cat_size)
out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats)))

cur_order = column.as_column(range(len(cur_codes)))
old_codes = column.as_column(
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,15 +666,32 @@ def _check_scatter_key_length(
f"{num_keys}"
)

def _validate_fillna_value(
    self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
    """Normalize ``fill_value`` for ``.fillna`` according to the column type.

    A scalar input is wrapped in a ``cudf.Scalar`` that carries this
    column's dtype; any other input is converted with ``as_column``.
    """
    # Handle the column-like case first so the scalar path is the fallthrough.
    if not is_scalar(fill_value):
        return as_column(fill_value)
    return cudf.Scalar(fill_value, dtype=self.dtype)

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
fill_value: ScalarLike | ColumnLike,
method: Literal["ffill", "bfill", None] = None,
) -> Self:
"""Fill null values with ``value``.
Returns a copy with null filled.
"""
if not self.has_nulls(include_nan=True):
return self.copy()
elif method is None:
if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
fill_value
):
return self.copy()
else:
fill_value = self._validate_fillna_value(fill_value)
return libcudf.replace.replace_nulls(
input_col=self.nans_to_nulls(),
replacement=fill_value,
Expand Down
21 changes: 2 additions & 19 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,17 @@
import locale
import re
from locale import nl_langinfo
from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
from typing import TYPE_CHECKING, Literal, Sequence, cast

import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
Expand Down Expand Up @@ -641,22 +640,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
else:
return result_col

def fillna(
    self,
    fill_value: Any = None,
    method: str | None = None,
) -> Self:
    """Prepare ``fill_value`` and delegate to the parent ``fillna``.

    When ``fill_value`` is ``None`` it is forwarded untouched. When
    ``cudf.utils.utils._isnat(fill_value)`` is true, a deep copy of this
    column is returned instead of filling. Otherwise a scalar that is not
    already a ``cudf.Scalar`` is wrapped in one with this column's dtype,
    and a non-scalar is converted with ``column.as_column``.
    """
    if fill_value is None:
        return super().fillna(fill_value, method)

    if cudf.utils.utils._isnat(fill_value):
        return self.copy(deep=True)

    if not is_scalar(fill_value):
        replacement = column.as_column(fill_value, nan_as_null=False)
    elif isinstance(fill_value, cudf.Scalar):
        # Already a device scalar: pass it through as given.
        replacement = fill_value
    else:
        replacement = cudf.Scalar(fill_value, dtype=self.dtype)

    return super().fillna(replacement, method)

def indices_of(
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
Expand Down
39 changes: 14 additions & 25 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@

import warnings
from decimal import Decimal
from typing import TYPE_CHECKING, Any, Sequence, cast
from typing import TYPE_CHECKING, Sequence, cast

import cupy as cp
import numpy as np
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand All @@ -31,7 +30,7 @@
from .numerical_base import NumericalBaseColumn

if TYPE_CHECKING:
from cudf._typing import ColumnBinaryOperand, Dtype
from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike


class DecimalBaseColumn(NumericalBaseColumn):
Expand Down Expand Up @@ -135,30 +134,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):

return result

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
) -> Self:
"""Fill null values with ``value``.
Returns a copy with null filled.
"""
def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if isinstance(fill_value, (int, Decimal)):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
elif (
isinstance(fill_value, DecimalBaseColumn)
or isinstance(fill_value, cudf.core.column.NumericalColumn)
and is_integer_dtype(fill_value.dtype)
return cudf.Scalar(fill_value, dtype=self.dtype)
elif isinstance(fill_value, ColumnBase) and (
isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu"
):
fill_value = fill_value.astype(self.dtype)
else:
raise TypeError(
"Decimal columns only support using fillna with decimal and "
"integer values"
)

return super().fillna(fill_value, method=method)
return fill_value.astype(self.dtype)
raise TypeError(
"Decimal columns only support using fillna with decimal and "
"integer values"
)

def normalize_binop_value(self, other):
if isinstance(other, ColumnBase):
Expand Down
63 changes: 16 additions & 47 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,57 +532,26 @@ def find_and_replace(
replaced, df._data["old"], df._data["new"]
)

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
) -> Self:
"""
Fill null values with *fill_value*
"""
col = self.nans_to_nulls()

if col.null_count == 0:
return col

if method is not None:
return super().fillna(fill_value, method)

if fill_value is None:
raise ValueError("Must specify either 'fill_value' or 'method'")

if (
isinstance(fill_value, cudf.Scalar)
and fill_value.dtype == col.dtype
):
return super().fillna(fill_value, method)

if np.isscalar(fill_value):
# cast safely to the same dtype as self
fill_value_casted = col.dtype.type(fill_value)
if not np.isnan(fill_value) and (fill_value_casted != fill_value):
def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if is_scalar(fill_value):
cudf_obj = cudf.Scalar(fill_value)
if not as_column(cudf_obj).can_cast_safely(self.dtype):
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{type(fill_value).__name__} to {col.dtype.name}"
f"{type(fill_value).__name__} to {self.dtype.name}"
)
fill_value = cudf.Scalar(fill_value_casted)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if is_integer_dtype(col.dtype):
# cast safely to the same dtype as self
if fill_value.dtype != col.dtype:
new_fill_value = fill_value.astype(col.dtype)
if not (new_fill_value == fill_value).all():
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{fill_value.dtype.type.__name__} to "
f"{col.dtype.type.__name__}"
)
fill_value = new_fill_value
else:
fill_value = fill_value.astype(col.dtype)

return super().fillna(fill_value, method)
cudf_obj = as_column(fill_value, nan_as_null=False)
if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined]
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{cudf_obj.dtype.type.__name__} to "
f"{self.dtype.type.__name__}"
)
return cudf_obj.astype(self.dtype)

def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
"""
Expand Down
18 changes: 1 addition & 17 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
import re
import warnings
from functools import cached_property
from typing import TYPE_CHECKING, Any, Sequence, cast, overload
from typing import TYPE_CHECKING, Sequence, cast, overload

import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
import cudf.api.types
Expand Down Expand Up @@ -5838,21 +5837,6 @@ def find_and_replace(
res = self
return libcudf.replace.replace(res, df._data["old"], df._data["new"])

def fillna(
    self,
    fill_value: Any = None,
    method: str | None = None,
) -> Self:
    """Prepare ``fill_value`` and delegate to the parent ``fillna``.

    ``None`` is forwarded untouched. A non-scalar is converted with
    ``column.as_column`` using this column's dtype. A scalar for which
    ``cudf._lib.scalar._is_null_host_scalar`` is true short-circuits to a
    deep copy; any other scalar is wrapped in a ``cudf.Scalar`` with this
    column's dtype.
    """
    if fill_value is None:
        return super().fillna(fill_value, method=method)

    if not is_scalar(fill_value):
        replacement = column.as_column(fill_value, dtype=self.dtype)
    elif cudf._lib.scalar._is_null_host_scalar(fill_value):
        # Trying to fill <NA> with <NA> value? Return copy.
        return self.copy(deep=True)
    else:
        replacement = cudf.Scalar(fill_value, dtype=self.dtype)

    return super().fillna(replacement, method=method)

def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
if (
isinstance(other, (column.ColumnBase, cudf.Scalar))
Expand Down
19 changes: 1 addition & 18 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@

import datetime
import functools
from typing import TYPE_CHECKING, Any, Sequence, cast
from typing import TYPE_CHECKING, Sequence, cast

import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -252,22 +251,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
def time_unit(self) -> str:
return np.datetime_data(self.dtype)[0]

def fillna(
    self,
    fill_value: Any = None,
    method: str | None = None,
) -> Self:
    """Prepare ``fill_value`` and delegate to the parent ``fillna``.

    ``None`` is forwarded untouched. When
    ``cudf.utils.utils._isnat(fill_value)`` is true, a deep copy of this
    column is returned instead of filling. A scalar is wrapped in a
    ``cudf.Scalar`` and cast to this column's dtype; a non-scalar is
    converted with ``column.as_column``.
    """
    if fill_value is None:
        return super().fillna(fill_value, method)

    if cudf.utils.utils._isnat(fill_value):
        return self.copy(deep=True)

    if is_scalar(fill_value):
        # Build the scalar first, then cast it to the column's own dtype.
        replacement = cudf.Scalar(fill_value).astype(self.dtype)
    else:
        replacement = column.as_column(fill_value, nan_as_null=False)

    return super().fillna(replacement, method)

def as_numerical_column(
self, dtype: Dtype
) -> "cudf.core.column.NumericalColumn":
Expand Down
Loading

0 comments on commit bc08662

Please sign in to comment.