From 3e5f019697252f6c300639a09eb67ff11a80ac43 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Sep 2023 07:11:13 -1000 Subject: [PATCH] Raise NotImplementedError for Categoricals with timezones (#14032) Currently, calling `cudf.from_pandas` on a pandas Categorical whose categories have a datetimetz dtype drops the timezone information (due to the conversion through pyarrow): ```python In [5]: import pandas as pd In [6]: ci = pd.CategoricalIndex(pd.date_range("2016-01-01 01:01:00", periods=5, freq="D").tz_localize("UTC")) In [7]: ci Out[7]: CategoricalIndex(['2016-01-01 01:01:00+00:00', '2016-01-02 01:01:00+00:00', '2016-01-03 01:01:00+00:00', '2016-01-04 01:01:00+00:00', '2016-01-05 01:01:00+00:00'], categories=[2016-01-01 01:01:00+00:00, 2016-01-02 01:01:00+00:00, 2016-01-03 01:01:00+00:00, 2016-01-04 01:01:00+00:00, 2016-01-05 01:01:00+00:00], ordered=False, dtype='category') In [8]: ci_cudf = cudf.from_pandas(ci) In [10]: ci_cudf Out[10]: CategoricalIndex(['2016-01-01 01:01:00', '2016-01-02 01:01:00', '2016-01-03 01:01:00', '2016-01-04 01:01:00', '2016-01-05 01:01:00'], categories=[2016-01-01 01:01:00, 2016-01-02 01:01:00, 2016-01-03 01:01:00, 2016-01-04 01:01:00, 2016-01-05 01:01:00], ordered=False, dtype='category') ``` As is already done for `IntervalIndex`, this change raises a `NotImplementedError` for now to avoid this incorrect behavior.
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14032 --- python/cudf/cudf/core/column/column.py | 31 ++++++++++++++++++++----- python/cudf/cudf/tests/test_datetime.py | 2 ++ python/cudf/cudf/tests/test_interval.py | 11 +++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad761ea8d18..9dde17a1045 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2033,9 +2033,19 @@ def as_column( f"{arbitrary.dtype} is not supported. Convert first to " f"{arbitrary.dtype.subtype}." ) - if is_categorical_dtype(arbitrary): + if is_categorical_dtype(arbitrary.dtype): + if isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) data = as_column(pa.array(arbitrary, from_pandas=True)) elif is_interval_dtype(arbitrary.dtype): + if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) @@ -2262,11 +2272,20 @@ def as_column( elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype) elif ( - isinstance(arbitrary, pd.DatetimeIndex) - and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) - ) or ( - isinstance(arbitrary, pd.IntervalIndex) - and is_datetime64tz_dtype(arbitrary.dtype.subtype) + ( + isinstance(arbitrary, pd.DatetimeIndex) + and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) + ) + or ( + isinstance(arbitrary, pd.IntervalIndex) + and is_datetime64tz_dtype(arbitrary.dtype.subtype) + ) + or ( + 
isinstance(arbitrary, pd.CategoricalIndex) + and isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ) + ) ): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index abcc057f823..b1685950241 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2095,6 +2095,8 @@ def test_construction_from_tz_timestamps(data): _ = cudf.Index(data) with pytest.raises(NotImplementedError): _ = cudf.DatetimeIndex(data) + with pytest.raises(NotImplementedError): + cudf.CategoricalIndex(data) @pytest.mark.parametrize("op", _cmpops) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 9704be44b95..a27de60c2c5 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -167,17 +167,18 @@ def test_interval_index_unique(): assert_eq(expected, actual) +@pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) @pytest.mark.parametrize("tz", ["US/Eastern", None]) -def test_interval_with_datetime(tz): +def test_interval_with_datetime(tz, box): dti = pd.date_range( start=pd.Timestamp("20180101", tz=tz), end=pd.Timestamp("20181231", tz=tz), freq="M", ) - pidx = pd.IntervalIndex.from_breaks(dti) + pobj = box(pd.IntervalIndex.from_breaks(dti)) if tz is None: - gidx = cudf.from_pandas(pidx) - assert_eq(pidx, gidx) + gobj = cudf.from_pandas(pobj) + assert_eq(pobj, gobj) else: with pytest.raises(NotImplementedError): - cudf.from_pandas(pidx) + cudf.from_pandas(pobj)