From a387520dfb00bc86fdd0838326c4898de11d6093 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 5 May 2024 08:41:34 +0000 Subject: [PATCH 1/4] Preserve sub-second data for time scalars in column construction --- python/cudf/cudf/core/column/column.py | 9 +++++++++ python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_series.py | 28 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ba2dab2c2e1..4df40eb0f19 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2163,6 +2163,15 @@ def as_column( nan_as_null=nan_as_null, length=length, ) + elif isinstance( + element, (pd.Timestamp, pd.Timedelta, type(pd.NaT)) + ): + return as_column( + pd.Series(arbitrary), + dtype=dtype, + nan_as_null=nan_as_null, + length=length, + ) elif not any(element is na for na in (None, pd.NA, np.nan)): # Might have NA + element like above, but short-circuit if # an element pyarrow/pandas might be able to parse diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bf8201e4dc1..6fa957684e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1215,7 +1215,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[us] + datetime datetime64[ns] string object dtype: object """ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 642dbde3790..6a9de197374 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2786,3 +2786,31 @@ def test_squeeze(axis, data): def test_squeeze_invalid_axis(axis): with pytest.raises(ValueError): cudf.Series([1]).squeeze(axis=axis) + + +@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) +def test_timestamp_series_init(data): + scalar = pd.Timestamp(data) + expected = 
pd.Series([scalar]) + actual = cudf.Series([scalar]) + + assert_eq(expected, actual) + + expected = pd.Series(scalar) + actual = cudf.Series(scalar) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) +def test_timedelta_series_init(data): + scalar = pd.Timedelta(data) + expected = pd.Series([scalar]) + actual = cudf.Series([scalar]) + + assert_eq(expected, actual) + + expected = pd.Series(scalar) + actual = cudf.Series(scalar) + + assert_eq(expected, actual) From e61a561e9b705a2221882ab800b0d9252e2627da Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 5 May 2024 13:56:03 +0000 Subject: [PATCH 2/4] Fix NaT handling in Scalar constructor --- python/cudf/cudf/_lib/scalar.pyx | 3 ++- python/cudf/cudf/core/scalar.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 7ddf4ff4883..d47d01eea06 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -354,7 +354,8 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): if cudf.utils.utils.is_na_like(slr): return True - elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): + elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ + isinstance(slr, type(pd.NaT)): return True else: return False diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f7d05e53ce7..eb038534df3 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -223,6 +223,9 @@ def _preprocess_host_value(self, value, dtype): if dtype is None: if not valid: + if isinstance(value, type(NaT)): + value = value.to_numpy() + if isinstance(value, (np.datetime64, np.timedelta64)): unit, _ = np.datetime_data(value) if unit == "generic": From 5866ab10584b143220492748eba06ffd30e0c2ca Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 6 May 2024 14:06:12 -0500 Subject: 
[PATCH 3/4] Add TODO referencing upstream Arrow issue for time-scalar handling in as_column --- python/cudf/cudf/core/column/column.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4df40eb0f19..2b56a9d42cb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2166,6 +2166,9 @@ def as_column( elif isinstance( element, (pd.Timestamp, pd.Timedelta, type(pd.NaT)) ): + # TODO: Remove this after + # https://github.com/apache/arrow/issues/26492 + # is fixed. return as_column( pd.Series(arbitrary), dtype=dtype, From 1540738a6105658d65978e46f19b9c3e25080941 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 6 May 2024 19:15:11 +0000 Subject: [PATCH 4/4] use is instead of isinstance --- python/cudf/cudf/_lib/scalar.pyx | 2 +- python/cudf/cudf/core/column/column.py | 5 +++-- python/cudf/cudf/core/scalar.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index d47d01eea06..aee496e9f1c 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -355,7 +355,7 @@ def _is_null_host_scalar(slr): if cudf.utils.utils.is_na_like(slr): return True elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - isinstance(slr, type(pd.NaT)): + slr is pd.NaT: return True else: return False diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2b56a9d42cb..a890603ebaf 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2163,8 +2163,9 @@ def as_column( nan_as_null=nan_as_null, length=length, ) - elif isinstance( - element, (pd.Timestamp, pd.Timedelta, type(pd.NaT)) + elif ( + isinstance(element, (pd.Timestamp, pd.Timedelta)) + or element is pd.NaT ): # TODO: Remove this after # https://github.com/apache/arrow/issues/26492 diff --git a/python/cudf/cudf/core/scalar.py 
b/python/cudf/cudf/core/scalar.py index eb038534df3..29460d8c67e 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -223,7 +223,7 @@ def _preprocess_host_value(self, value, dtype): if dtype is None: if not valid: - if isinstance(value, type(NaT)): + if value is NaT: value = value.to_numpy() if isinstance(value, (np.datetime64, np.timedelta64)):