From 39253f14478893cd9e65c17e964f65b3e86f5afd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:57:09 -0700 Subject: [PATCH 1/2] Support timezone aware pandas inputs in cudf --- python/cudf/cudf/core/column/column.py | 27 +++++-------------- python/cudf/cudf/core/index.py | 11 +++----- .../cudf/tests/series/test_datetimelike.py | 13 +++++++++ python/cudf/cudf/tests/test_datetime.py | 26 +++--------------- .../dask_cudf/io/tests/test_parquet.py | 21 +++++++++++++++ 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68079371b85..c8b2d4eba08 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "yet supported in pyarrow, see: " "https://github.com/apache/arrow/issues/20213" ) - elif pa.types.is_timestamp(array.type) and array.type.tz is not None: - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) elif pa.types.is_large_string(array.type): @@ -990,9 +986,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: return col elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.datetime64): + elif dtype.kind == "M": return col.as_datetime_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.timedelta64): + elif dtype.kind == "m": return col.as_timedelta_column(dtype) elif dtype.kind == "O": if cudf.get_option("mode.pandas_compatible") and was_object: @@ -1842,21 +1838,11 @@ def as_column( and arbitrary.freq is not None ): raise NotImplementedError("freq is not implemented yet") - elif ( - isinstance(arbitrary.dtype, pd.DatetimeTZDtype) - or ( - isinstance(arbitrary.dtype, pd.IntervalDtype) - and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype) - ) - or ( - isinstance(arbitrary.dtype, pd.CategoricalDtype) - and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ) - ) + elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance( + arbitrary.dtype.subtype, pd.DatetimeTZDtype ): raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" + "cuDF does not yet support Intervals with timezone-aware datetimes" ) elif _is_pandas_nullable_extension_dtype(arbitrary.dtype): if cudf.get_option("mode.pandas_compatible"): @@ -1872,7 +1858,8 @@ def as_column( length=length, ) elif isinstance( - arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) + arbitrary.dtype, + (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype), ): return as_column( pa.array(arbitrary, from_pandas=True), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7297ac4e929..732e5cdb01a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1757,13 +1757,10 @@ def __init__( name = _getdefault_name(data, name=name) data = column.as_column(data) - # TODO: Remove this if statement and fix tests now that - # there's timezone support - if isinstance(data.dtype, pd.DatetimeTZDtype): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - data = data.astype(dtype) + # TODO: if data.dtype.kind == "M" (i.e. data is already datetime type) + # We probably shouldn't always astype to datetime64[ns] + if not isinstance(data.dtype, pd.DatetimeTZDtype): + data = data.astype(dtype) if copy: data = data.copy() diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 7ef55761b2b..58ffc610c3c 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected): def test_tz_convert_naive_typeerror(): with pytest.raises(TypeError): cudf.date_range("2020", periods=2, freq="D").tz_convert(None) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_from_pandas_obj_tz_aware(klass): + tz_aware_data = [ + pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific") + ] + pandas_obj = getattr(pd, klass)(tz_aware_data) + result = cudf.from_pandas(pandas_obj) + expected = getattr(cudf, klass)(tz_aware_data) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4186fff038a..e3ecaafae5b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ - [pd.Timestamp("2001-01-01", tz="America/New_York")], - pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - ], -) -def test_construction_from_tz_timestamps(data): - with pytest.raises(NotImplementedError): - _ = cudf.Series(data) - with pytest.raises(NotImplementedError): - _ = cudf.Index(data) - with pytest.raises(NotImplementedError): - _ = cudf.DatetimeIndex(data) - with pytest.raises(NotImplementedError): - cudf.CategoricalIndex(data) - - @pytest.mark.parametrize("op", _cmpops) def test_datetime_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") @@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr): t.dt.ceil(freqstr) -def test_timezone_array_notimplemented(): +def test_timezone_pyarrow_array(): pa_array = pa.array( [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], type=pa.timestamp("ns", "UTC"), ) - with pytest.raises(NotImplementedError): - cudf.Series(pa_array) + result = cudf.Series(pa_array) + expected = pa_array.to_pandas() + assert_eq(result, expected) def test_to_datetime_errors_ignore_deprecated(): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 39800145585..faba0307453 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -596,3 +596,24 @@ def test_parquet_read_filter_and_project(tmpdir): # Check result expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) dd.assert_eq(got, expected) + + +def test_timezone_column(tmpdir): + path = str(tmpdir.join("test.parquet")) + pdf = pd.DataFrame( + { + "time": pd.to_datetime( + ["1996-01-02", "1996-12-01"], + utc=True, + ), + "x": [1, 2], + } + ) + pdf.to_parquet(path) + # cudf.read_parquet does not support reading timezone aware types yet, so check dtypes + got = dask_cudf.read_parquet(path).dtypes + expected = pd.Series( + {"time": pd.DatetimeTZDtype("ns", "UTC"), "x": np.dtype(np.int64)}, + dtype=object, + ) + pd.testing.assert_series_equal(got, expected) From 0905a285b2a71a3b867334b72a23f8ad854500bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Jun 2024 09:26:16 -0700 Subject: [PATCH 2/2] use a better test --- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index faba0307453..f3e3911e6c7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -610,10 +610,9 @@ def test_timezone_column(tmpdir): } ) pdf.to_parquet(path) - # cudf.read_parquet does not support reading timezone aware types yet, so check dtypes - got = dask_cudf.read_parquet(path).dtypes - expected = pd.Series( - {"time": pd.DatetimeTZDtype("ns", "UTC"), "x": np.dtype(np.int64)}, - dtype=object, - ) - pd.testing.assert_series_equal(got, expected) + got = dask_cudf.read_parquet(path) + # cudf.read_parquet does not support reading timezone aware types yet + assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC") + got["time"] = got["time"].astype("datetime64[ns]") + expected = cudf.read_parquet(path) + dd.assert_eq(got, expected)