Preserve sub-second data for time scalars in column construction (#15655

) Fixes: #15654 This PR makes fixes such that sub-second timestamp data is not being dropped in column construction. Forks out of #14534 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: #15655
rapidsai · May 7, 2024 · 5d244df · 5d244df
1 parent 8d9c06a
commit 5d244df
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 2 deletions.
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
@@ -354,7 +354,8 @@ def as_device_scalar(val, dtype=None):
 def _is_null_host_scalar(slr):
     if cudf.utils.utils.is_na_like(slr):
         return True
-    elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr):
+    elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \
+            slr is pd.NaT:
         return True
     else:
         return False

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2163,6 +2163,19 @@ def as_column(
                     nan_as_null=nan_as_null,
                     length=length,
                 )
+            elif (
+                isinstance(element, (pd.Timestamp, pd.Timedelta))
+                or element is pd.NaT
+            ):
+                # TODO: Remove this after
+                # https://github.com/apache/arrow/issues/26492
+                # is fixed.
+                return as_column(
+                    pd.Series(arbitrary),
+                    dtype=dtype,
+                    nan_as_null=nan_as_null,
+                    length=length,
+                )
             elif not any(element is na for na in (None, pd.NA, np.nan)):
                 # Might have NA + element like above, but short-circuit if
                 # an element pyarrow/pandas might be able to parse

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -1215,7 +1215,7 @@ def dtypes(self):
         >>> df.dtypes
         float              float64
         int                  int64
-        datetime    datetime64[us]
+        datetime    datetime64[ns]
         string              object
         dtype: object
         """

diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
@@ -223,6 +223,9 @@ def _preprocess_host_value(self, value, dtype):
 
         if dtype is None:
             if not valid:
+                if value is NaT:
+                    value = value.to_numpy()
+
                 if isinstance(value, (np.datetime64, np.timedelta64)):
                     unit, _ = np.datetime_data(value)
                     if unit == "generic":

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -2786,3 +2786,31 @@ def test_squeeze(axis, data):
 def test_squeeze_invalid_axis(axis):
     with pytest.raises(ValueError):
         cudf.Series([1]).squeeze(axis=axis)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timestamp_series_init(data):
+    scalar = pd.Timestamp(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timedelta_series_init(data):
+    scalar = pd.Timedelta(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)