Skip to content

Commit

Permalink
Raise NotImplementedError for Categoricals with timezones (#14032)
Browse files Browse the repository at this point in the history
Currently, calling `cudf.from_pandas` on a pandas Categorical whose categories are timezone-aware datetimes (`pd.DatetimeTZDtype`) silently drops the timezone information (due to the conversion through pyarrow):

```python
In [5]: import pandas as pd

In [6]: ci = pd.CategoricalIndex(pd.date_range("2016-01-01 01:01:00", periods=5, freq="D").tz_localize("UTC"))

In [7]: ci
Out[7]: 
CategoricalIndex(['2016-01-01 01:01:00+00:00', '2016-01-02 01:01:00+00:00',
                  '2016-01-03 01:01:00+00:00', '2016-01-04 01:01:00+00:00',
                  '2016-01-05 01:01:00+00:00'],
                 categories=[2016-01-01 01:01:00+00:00, 2016-01-02 01:01:00+00:00, 2016-01-03 01:01:00+00:00, 2016-01-04 01:01:00+00:00, 2016-01-05 01:01:00+00:00], ordered=False, dtype='category')

In [8]: ci_cudf = cudf.from_pandas(ci)

In [10]: ci_cudf
Out[10]: 
CategoricalIndex(['2016-01-01 01:01:00', '2016-01-02 01:01:00',
                  '2016-01-03 01:01:00', '2016-01-04 01:01:00',
                  '2016-01-05 01:01:00'],
                 categories=[2016-01-01 01:01:00, 2016-01-02 01:01:00, 2016-01-03 01:01:00, 2016-01-04 01:01:00, 2016-01-05 01:01:00], ordered=False, dtype='category')
```

As is already done for `IntervalIndex`, this change raises a `NotImplementedError` for now to avoid this incorrect behavior.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14032
  • Loading branch information
mroeschke authored Sep 4, 2023
1 parent c516336 commit 3e5f019
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 11 deletions.
31 changes: 25 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,9 +2033,19 @@ def as_column(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
if is_categorical_dtype(arbitrary):
if is_categorical_dtype(arbitrary.dtype):
if isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
Expand Down Expand Up @@ -2262,11 +2272,20 @@ def as_column(
elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray):
data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype)
elif (
isinstance(arbitrary, pd.DatetimeIndex)
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
) or (
isinstance(arbitrary, pd.IntervalIndex)
and is_datetime64tz_dtype(arbitrary.dtype.subtype)
(
isinstance(arbitrary, pd.DatetimeIndex)
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary, pd.IntervalIndex)
and is_datetime64tz_dtype(arbitrary.dtype.subtype)
)
or (
isinstance(arbitrary, pd.CategoricalIndex)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2095,6 +2095,8 @@ def test_construction_from_tz_timestamps(data):
_ = cudf.Index(data)
with pytest.raises(NotImplementedError):
_ = cudf.DatetimeIndex(data)
with pytest.raises(NotImplementedError):
cudf.CategoricalIndex(data)


@pytest.mark.parametrize("op", _cmpops)
Expand Down
11 changes: 6 additions & 5 deletions python/cudf/cudf/tests/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,18 @@ def test_interval_index_unique():
assert_eq(expected, actual)


@pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex])
@pytest.mark.parametrize("tz", ["US/Eastern", None])
def test_interval_with_datetime(tz):
def test_interval_with_datetime(tz, box):
dti = pd.date_range(
start=pd.Timestamp("20180101", tz=tz),
end=pd.Timestamp("20181231", tz=tz),
freq="M",
)
pidx = pd.IntervalIndex.from_breaks(dti)
pobj = box(pd.IntervalIndex.from_breaks(dti))
if tz is None:
gidx = cudf.from_pandas(pidx)
assert_eq(pidx, gidx)
gobj = cudf.from_pandas(pobj)
assert_eq(pobj, gobj)
else:
with pytest.raises(NotImplementedError):
cudf.from_pandas(pidx)
cudf.from_pandas(pobj)

0 comments on commit 3e5f019

Please sign in to comment.