Skip to content

Commit

Permalink
Disable assigning of NaT to non-datetime/timedelta columns (rapidsa…
Browse files Browse the repository at this point in the history
…i#23)

Fixes: rapidsai/cudf#14218

This PR disallows assigning NaT to non-datetime/timedelta columns. Pandas allows this by changing the column to object dtype, which we cannot support.
  • Loading branch information
galipremsagar authored Oct 3, 2023
1 parent 13dc80a commit 9a22053
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 36 deletions.
15 changes: 9 additions & 6 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
from cudf.utils.utils import _all_bools_with_nulls

_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format

Expand Down Expand Up @@ -308,13 +308,16 @@ def to_pandas(
nullable: bool = False,
**kwargs,
) -> "cudf.Series":
# Workaround until following issue is fixed:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

# Pandas supports only `datetime64[ns]`, hence the cast.
# Pandas only supports `datetime64[ns]` dtype
# and conversion to this type is necessary to make
# arrow to pandas conversion happen for large values.
return pd.Series(
self.astype("datetime64[ns]").fillna("NaT").values_host,
copy=False,
self.astype("datetime64[ns]").to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)

Expand Down Expand Up @@ -590,7 +593,7 @@ def fillna(
) -> DatetimeColumn:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return _fillna_natwise(self)
return self.copy(deep=True)
if is_scalar(fill_value):
if not isinstance(fill_value, cudf.Scalar):
fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
Expand Down
23 changes: 11 additions & 12 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column import ColumnBase, column, string
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
from cudf.utils.utils import _all_bools_with_nulls

_dtype_to_format_conversion = {
"timedelta64[ns]": "%D days %H:%M:%S",
Expand Down Expand Up @@ -146,20 +146,19 @@ def to_arrow(self) -> pa.Array:
def to_pandas(
self, index=None, nullable: bool = False, **kwargs
) -> pd.Series:
# Workaround until following issue is fixed:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

# Pandas supports only `timedelta64[ns]`, hence the cast.
pd_series = pd.Series(
self.astype("timedelta64[ns]").fillna("NaT").values_host,
copy=False,
# Pandas only supports `timedelta64[ns]` dtype
# and conversion to this type is necessary to make
# arrow to pandas conversion happen for large values.
return pd.Series(
self.astype("timedelta64[ns]").to_arrow(),
copy=True,
dtype=self.dtype,
index=index,
)

if index is not None:
pd_series.index = index

return pd_series

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
reflect, op = self._check_reflected_op(op)
other = self._wrap_binop_normalization(other)
Expand Down Expand Up @@ -283,7 +282,7 @@ def fillna(
) -> TimeDeltaColumn:
if fill_value is not None:
if cudf.utils.utils._isnat(fill_value):
return _fillna_natwise(self)
return self.copy(deep=True)
col: ColumnBase = self
if is_scalar(fill_value):
if isinstance(fill_value, np.timedelta64):
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
from cudf.core.resample import SeriesResampler
from cudf.core.single_column_frame import SingleColumnFrame
from cudf.core.udf.scalar_function import _get_scalar_kernel
from cudf.errors import MixedTypeError
from cudf.utils import docutils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -211,6 +212,27 @@ def __setitem__(self, key, value):
# coerce value into a scalar or column
if is_scalar(value):
value = to_cudf_compatible_scalar(value)
if (
not isinstance(
self._frame._column,
(
cudf.core.column.DatetimeColumn,
cudf.core.column.TimeDeltaColumn,
),
)
and cudf.utils.utils._isnat(value)
and not (
isinstance(
self._frame._column, cudf.core.column.StringColumn
)
and isinstance(value, str)
)
):
raise MixedTypeError(
f"Cannot assign {value=} to non-datetime/non-timedelta "
"columns"
)

elif not (
isinstance(value, (list, dict))
and isinstance(
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,7 @@ def test_datetime_invalid_ops():
np.datetime64("2005-02-25"),
np.datetime64("2005-02-25T03:30"),
np.datetime64("nat"),
"NaT",
],
)
def test_datetime_fillna(data, dtype, fill_value):
Expand Down
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2341,6 +2341,32 @@ def test_series_count_invalid_param():
s.count(skipna=True)


@pytest.mark.parametrize(
"data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]]
)
def test_series_setitem_nat_with_non_datetimes(data):
s = cudf.Series(data)
with pytest.raises(TypeError):
s[0] = cudf.NaT


def test_series_string_setitem():
gs = cudf.Series(["abc", "def", "ghi", "xyz", "pqr"])
ps = gs.to_pandas()

gs[0] = "NaT"
gs[1] = "NA"
gs[2] = "<NA>"
gs[3] = "NaN"

ps[0] = "NaT"
ps[1] = "NA"
ps[2] = "<NA>"
ps[3] = "NaN"

assert_eq(gs, ps)


def test_multi_dim_series_error():
arr = cp.array([(1, 2), (3, 4)])
with pytest.raises(ValueError):
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/tests/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,7 @@ def local_assert(expected, actual):
np.timedelta64(1, "ms"),
np.timedelta64(1, "us"),
np.timedelta64(1, "ns"),
"NaT",
],
)
def test_timedelta_fillna(data, dtype, fill_value):
Expand Down
26 changes: 8 additions & 18 deletions python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import FrozenSet, Set, Union

import numpy as np
import pandas as pd
from nvtx import annotate

import rmm
Expand Down Expand Up @@ -264,26 +265,15 @@ def pa_mask_buffer_to_mask(mask_buf, size):

def _isnat(val):
"""Wraps np.isnat to return False instead of error on invalid inputs."""
if not isinstance(val, (np.datetime64, np.timedelta64, str)):
if val is pd.NaT:
return True
elif not isinstance(val, (np.datetime64, np.timedelta64, str)):
return False
else:
return val in {"NaT", "NAT"} or np.isnat(val)


def _fillna_natwise(col):
# If the value we are filling is np.datetime64("NAT")
# we set the same mask as current column.
# However where there are "<NA>" in the
# columns, their corresponding locations
nat = cudf._lib.scalar._create_proxy_nat_scalar(col.dtype)
result = cudf._lib.replace.replace_nulls(col, nat)
return column.build_column(
data=result.base_data,
dtype=result.dtype,
size=result.size,
offset=result.offset,
children=result.base_children,
)
try:
return val in {"NaT", "NAT"} or np.isnat(val)
except TypeError:
return False


def search_range(x: int, ri: range, *, side: str) -> int:
Expand Down

0 comments on commit 9a22053

Please sign in to comment.