From 5d244dfc13f4db0b1e41ded3029942fec50c98f6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 7 May 2024 15:52:18 -0500 Subject: [PATCH] Preserve sub-second data for time scalars in column construction (#15655) Fixes: #15654 This PR ensures that sub-second timestamp and timedelta data is no longer dropped during column construction. Forked out of https://github.com/rapidsai/cudf/pull/14534/ Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15655 --- python/cudf/cudf/_lib/scalar.pyx | 3 ++- python/cudf/cudf/core/column/column.py | 13 ++++++++++++ python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/scalar.py | 3 +++ python/cudf/cudf/tests/test_series.py | 28 ++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 7ddf4ff4883..aee496e9f1c 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -354,7 +354,8 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): if cudf.utils.utils.is_na_like(slr): return True - elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): + elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ + slr is pd.NaT: return True else: return False diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 553f4cc7fb3..e23da59b883 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2163,6 +2163,19 @@ def as_column( nan_as_null=nan_as_null, length=length, ) + elif ( + isinstance(element, (pd.Timestamp, pd.Timedelta)) + or element is pd.NaT + ): + # TODO: Remove this after + # https://github.com/apache/arrow/issues/26492 + # is fixed. 
+ return as_column( + pd.Series(arbitrary), + dtype=dtype, + nan_as_null=nan_as_null, + length=length, + ) elif not any(element is na for na in (None, pd.NA, np.nan)): # Might have NA + element like above, but short-circuit if # an element pyarrow/pandas might be able to parse diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bf8201e4dc1..6fa957684e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1215,7 +1215,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[us] + datetime datetime64[ns] string object dtype: object """ diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f7d05e53ce7..29460d8c67e 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -223,6 +223,9 @@ def _preprocess_host_value(self, value, dtype): if dtype is None: if not valid: + if value is NaT: + value = value.to_numpy() + if isinstance(value, (np.datetime64, np.timedelta64)): unit, _ = np.datetime_data(value) if unit == "generic": diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 642dbde3790..6a9de197374 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2786,3 +2786,31 @@ def test_squeeze(axis, data): def test_squeeze_invalid_axis(axis): with pytest.raises(ValueError): cudf.Series([1]).squeeze(axis=axis) + + +@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) +def test_timestamp_series_init(data): + scalar = pd.Timestamp(data) + expected = pd.Series([scalar]) + actual = cudf.Series([scalar]) + + assert_eq(expected, actual) + + expected = pd.Series(scalar) + actual = cudf.Series(scalar) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) +def test_timedelta_series_init(data): + scalar = pd.Timedelta(data) + expected = pd.Series([scalar]) + actual = 
cudf.Series([scalar]) + + assert_eq(expected, actual) + + expected = pd.Series(scalar) + actual = cudf.Series(scalar) + + assert_eq(expected, actual)