Skip to content

Commit

Permalink
Define Column.nans_to_nulls to return self (#15923)
Browse files Browse the repository at this point in the history
While trying to clean up all the `fillna` logic, I needed to have a `Column.nans_to_nulls` defined to make the `fillna` logic more reusable.

This allows other `nans_to_nulls` usages in cudf to avoid checking whether the method is defined on the column or not.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15923
  • Loading branch information
mroeschke authored Jun 7, 2024
1 parent 9bd16bb commit d83d086
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 43 deletions.
7 changes: 1 addition & 6 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2072,12 +2072,7 @@ def dropna(self, how="any"):
pass
# This is to be consistent with IndexedFrame.dropna to handle nans
# as nulls by default
data_columns = [
col.nans_to_nulls()
if isinstance(col, cudf.core.column.NumericalColumn)
else col
for col in self._columns
]
data_columns = [col.nans_to_nulls() for col in self._columns]

return self._from_columns_like_self(
drop_nulls(
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,10 +816,8 @@ def to_pandas(
.values_host
)

cats = col.categories
if cats.dtype.kind in "biuf":
cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined]
elif not isinstance(cats.dtype, IntervalDtype):
cats = col.categories.nans_to_nulls()
if not isinstance(cats.dtype, IntervalDtype):
# leaving out dropna because it temporarily changes an interval
# index into a struct and throws off results.
# TODO: work on interval index dropna
Expand Down
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool:

return libcudf.reduce.reduce("any", self, dtype=np.bool_)

def dropna(self) -> ColumnBase:
def dropna(self) -> Self:
return drop_nulls([self])[0]._with_type_metadata(self.dtype)

def to_arrow(self) -> pa.Array:
Expand Down Expand Up @@ -695,7 +695,9 @@ def fillna(
Returns a copy with null filled.
"""
return libcudf.replace.replace_nulls(
input_col=self, replacement=fill_value, method=method
input_col=self.nans_to_nulls(),
replacement=fill_value,
method=method,
)._with_type_metadata(self.dtype)

def isnull(self) -> ColumnBase:
Expand Down Expand Up @@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str):
f"Operation {unaryop} not supported for dtype {self.dtype}."
)

def nans_to_nulls(self: Self) -> Self:
    """Convert NaN to NA.

    This implementation performs no conversion: it returns ``self``
    itself rather than a copy, so a caller that needs an independent
    column must copy the result.

    Returns
    -------
    Self
        The very same column object this method was called on.
    """
    # NOTE(review): column types that can actually hold NaN presumably
    # override this method -- confirm in the subclasses.
    return self

def normalize_binop_value(
self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
Expand Down Expand Up @@ -1802,9 +1808,7 @@ def as_column(

data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
col = build_column(data, dtype=arbitrary.dtype, mask=mask)
if (
nan_as_null or (mask is None and nan_as_null is None)
) and col.dtype.kind == "f":
if nan_as_null or (mask is None and nan_as_null is None):
col = col.nans_to_nulls()
if dtype is not None:
col = col.astype(dtype)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ def fillna(
return col

if method is not None:
return super(NumericalColumn, col).fillna(fill_value, method)
return super().fillna(fill_value, method)

if fill_value is None:
raise ValueError("Must specify either 'fill_value' or 'method'")
Expand All @@ -545,7 +545,7 @@ def fillna(
isinstance(fill_value, cudf.Scalar)
and fill_value.dtype == col.dtype
):
return super(NumericalColumn, col).fillna(fill_value, method)
return super().fillna(fill_value, method)

if np.isscalar(fill_value):
# cast safely to the same dtype as self
Expand All @@ -572,7 +572,7 @@ def fillna(
else:
fill_value = fill_value.astype(col.dtype)

return super(NumericalColumn, col).fillna(fill_value, method)
return super().fillna(fill_value, method)

def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
"""
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float:
if len(self) == 0 or self._can_return_nan(skipna=skipna):
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

self = self.nans_to_nulls().dropna() # type: ignore
self = self.nans_to_nulls().dropna()

if len(self) < 4:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
Expand All @@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike:
if len(self) == 0 or self._can_return_nan(skipna=skipna):
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

self = self.nans_to_nulls().dropna() # type: ignore
self = self.nans_to_nulls().dropna()

if len(self) < 3:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
Expand Down
29 changes: 9 additions & 20 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True):
results = {}
for name, col in self._data.items():
if skipna:
try:
result_col = col.nans_to_nulls()
except AttributeError:
result_col = col
result_col = col.nans_to_nulls()
else:
if col.has_nulls(include_nan=True):
first_index = col.isnull().find_first_value(True)
Expand Down Expand Up @@ -1915,12 +1912,12 @@ def nans_to_nulls(self):
1 <NA> 3.14
2 <NA> <NA>
"""
result = (
col.nans_to_nulls()
if isinstance(col, cudf.core.column.NumericalColumn)
else col.copy()
for col in self._data.columns
)
result = []
for col in self._data.columns:
converted = col.nans_to_nulls()
if converted is col:
converted = converted.copy()
result.append(converted)
return self._from_data_like_self(
self._data._from_columns_like_self(result)
)
Expand Down Expand Up @@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
thresh = len(df)

for name, col in df._data.items():
try:
check_col = col.nans_to_nulls()
except AttributeError:
check_col = col
check_col = col.nans_to_nulls()
no_threshold_valid_count = (
len(col) - check_col.null_count
) < thresh
Expand Down Expand Up @@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
if len(subset) == 0:
return self.copy(deep=True)

data_columns = [
col.nans_to_nulls()
if isinstance(col, cudf.core.column.NumericalColumn)
else col
for col in self._columns
]
data_columns = [col.nans_to_nulls() for col in self._columns]

return self._from_columns_like_self(
libcudf.stream_compaction.drop_nulls(
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na):
else:
unique = column.unique().sort_values()
if not dummy_na:
if np.issubdtype(unique.dtype, np.floating):
unique = unique.nans_to_nulls()
unique = unique.dropna()
unique = unique.nans_to_nulls().dropna()
return unique


Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/tests/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

import cudf
Expand Down Expand Up @@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex():
actual = gdf.fillna(10)

assert_eq(expected, actual)


def test_fillna_nan_and_null():
    """Check that ``fillna`` fills both the NaN and the null entry."""
    # The input holds one NaN and one null; ``nan_as_null=False`` is
    # passed so the NaN is intended to stay a NaN at construction time.
    values = pa.array([float("nan"), None, 1.1])
    filled = cudf.Series(values, nan_as_null=False).fillna(2.2)
    assert_eq(filled, cudf.Series([2.2, 2.2, 1.1]))
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy():
ser1 = cudf.Series(range(3), index=list("abc"))
ser2 = cudf.Series(ser1)
assert ser1.index is ser2.index


@pytest.mark.parametrize("value", [1, 1.1])
def test_nans_to_nulls_noop_copies_column(value):
    """Check that ``Series.nans_to_nulls`` hands back a distinct column.

    Neither parametrized value is NaN, so there is nothing to convert;
    the resulting series must still not share its column object with
    the input series.
    """
    original = cudf.Series([value])
    converted = original.nans_to_nulls()
    assert converted._column is not original._column

0 comments on commit d83d086

Please sign in to comment.