Disable assigning of NaT to non-datetime/timedelta columns (rapidsa…

…i#23) Fixes: rapidsai/cudf#14218 This PR disallows assigning NaT to non-datetime/timedelta columns. Pandas allows this by changing the column to object dtype, which we cannot support.
vuule · Oct 3, 2023 · 9a22053 · 9a22053
1 parent 13dc80a
commit 9a22053
Show file tree

Hide file tree

Showing 7 changed files with 78 additions and 36 deletions.
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -31,7 +31,7 @@
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
-from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
+from cudf.utils.utils import _all_bools_with_nulls
 
 _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
 
@@ -308,13 +308,16 @@ def to_pandas(
         nullable: bool = False,
         **kwargs,
     ) -> "cudf.Series":
-        # Workaround until following issue is fixed:
+        # `copy=True` workaround until following issue is fixed:
         # https://issues.apache.org/jira/browse/ARROW-9772
 
-        # Pandas supports only `datetime64[ns]`, hence the cast.
+        # Pandas only supports `datetime64[ns]` dtype
+        # and conversion to this type is necessary to make
+        # arrow to pandas conversion happen for large values.
         return pd.Series(
-            self.astype("datetime64[ns]").fillna("NaT").values_host,
-            copy=False,
+            self.astype("datetime64[ns]").to_arrow(),
+            copy=True,
+            dtype=self.dtype,
             index=index,
         )
 
@@ -590,7 +593,7 @@ def fillna(
     ) -> DatetimeColumn:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
-                return _fillna_natwise(self)
+                return self.copy(deep=True)
             if is_scalar(fill_value):
                 if not isinstance(fill_value, cudf.Scalar):
                     fill_value = cudf.Scalar(fill_value, dtype=self.dtype)

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -16,7 +16,7 @@
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, column, string
 from cudf.utils.dtypes import np_to_pa_dtype
-from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
+from cudf.utils.utils import _all_bools_with_nulls
 
 _dtype_to_format_conversion = {
     "timedelta64[ns]": "%D days %H:%M:%S",
@@ -146,20 +146,19 @@ def to_arrow(self) -> pa.Array:
     def to_pandas(
         self, index=None, nullable: bool = False, **kwargs
     ) -> pd.Series:
-        # Workaround until following issue is fixed:
+        # `copy=True` workaround until following issue is fixed:
         # https://issues.apache.org/jira/browse/ARROW-9772
 
-        # Pandas supports only `timedelta64[ns]`, hence the cast.
-        pd_series = pd.Series(
-            self.astype("timedelta64[ns]").fillna("NaT").values_host,
-            copy=False,
+        # Pandas only supports `timedelta64[ns]` dtype
+        # and conversion to this type is necessary to make
+        # arrow to pandas conversion happen for large values.
+        return pd.Series(
+            self.astype("timedelta64[ns]").to_arrow(),
+            copy=True,
+            dtype=self.dtype,
+            index=index,
         )
 
-        if index is not None:
-            pd_series.index = index
-
-        return pd_series
-
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         reflect, op = self._check_reflected_op(op)
         other = self._wrap_binop_normalization(other)
@@ -283,7 +282,7 @@ def fillna(
     ) -> TimeDeltaColumn:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
-                return _fillna_natwise(self)
+                return self.copy(deep=True)
             col: ColumnBase = self
             if is_scalar(fill_value):
                 if isinstance(fill_value, np.timedelta64):

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -82,6 +82,7 @@
 from cudf.core.resample import SeriesResampler
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.core.udf.scalar_function import _get_scalar_kernel
+from cudf.errors import MixedTypeError
 from cudf.utils import docutils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -211,6 +212,27 @@ def __setitem__(self, key, value):
         # coerce value into a scalar or column
         if is_scalar(value):
             value = to_cudf_compatible_scalar(value)
+            if (
+                not isinstance(
+                    self._frame._column,
+                    (
+                        cudf.core.column.DatetimeColumn,
+                        cudf.core.column.TimeDeltaColumn,
+                    ),
+                )
+                and cudf.utils.utils._isnat(value)
+                and not (
+                    isinstance(
+                        self._frame._column, cudf.core.column.StringColumn
+                    )
+                    and isinstance(value, str)
+                )
+            ):
+                raise MixedTypeError(
+                    f"Cannot assign {value=} to non-datetime/non-timedelta "
+                    "columns"
+                )
+
         elif not (
             isinstance(value, (list, dict))
             and isinstance(

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
@@ -1158,6 +1158,7 @@ def test_datetime_invalid_ops():
         np.datetime64("2005-02-25"),
         np.datetime64("2005-02-25T03:30"),
         np.datetime64("nat"),
+        "NaT",
     ],
 )
 def test_datetime_fillna(data, dtype, fill_value):

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -2341,6 +2341,32 @@ def test_series_count_invalid_param():
         s.count(skipna=True)
 
 
+@pytest.mark.parametrize(
+    "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]]
+)
+def test_series_setitem_nat_with_non_datetimes(data):
+    s = cudf.Series(data)
+    with pytest.raises(TypeError):
+        s[0] = cudf.NaT
+
+
+def test_series_string_setitem():
+    gs = cudf.Series(["abc", "def", "ghi", "xyz", "pqr"])
+    ps = gs.to_pandas()
+
+    gs[0] = "NaT"
+    gs[1] = "NA"
+    gs[2] = "<NA>"
+    gs[3] = "NaN"
+
+    ps[0] = "NaT"
+    ps[1] = "NA"
+    ps[2] = "<NA>"
+    ps[3] = "NaN"
+
+    assert_eq(gs, ps)
+
+
 def test_multi_dim_series_error():
     arr = cp.array([(1, 2), (3, 4)])
     with pytest.raises(ValueError):

diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
@@ -1030,6 +1030,7 @@ def local_assert(expected, actual):
         np.timedelta64(1, "ms"),
         np.timedelta64(1, "us"),
         np.timedelta64(1, "ns"),
+        "NaT",
     ],
 )
 def test_timedelta_fillna(data, dtype, fill_value):

diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
@@ -10,6 +10,7 @@
 from typing import FrozenSet, Set, Union
 
 import numpy as np
+import pandas as pd
 from nvtx import annotate
 
 import rmm
@@ -264,26 +265,15 @@ def pa_mask_buffer_to_mask(mask_buf, size):
 
 def _isnat(val):
     """Wraps np.isnat to return False instead of error on invalid inputs."""
-    if not isinstance(val, (np.datetime64, np.timedelta64, str)):
+    if val is pd.NaT:
+        return True
+    elif not isinstance(val, (np.datetime64, np.timedelta64, str)):
         return False
     else:
-        return val in {"NaT", "NAT"} or np.isnat(val)
-
-
-def _fillna_natwise(col):
-    # If the value we are filling is np.datetime64("NAT")
-    # we set the same mask as current column.
-    # However where there are "<NA>" in the
-    # columns, their corresponding locations
-    nat = cudf._lib.scalar._create_proxy_nat_scalar(col.dtype)
-    result = cudf._lib.replace.replace_nulls(col, nat)
-    return column.build_column(
-        data=result.base_data,
-        dtype=result.dtype,
-        size=result.size,
-        offset=result.offset,
-        children=result.base_children,
-    )
+        try:
+            return val in {"NaT", "NAT"} or np.isnat(val)
+        except TypeError:
+            return False
 
 
 def search_range(x: int, ri: range, *, side: str) -> int: