Skip to content

Commit

Permalink
Support timezone aware pandas inputs in cudf (#15935)
Browse files Browse the repository at this point in the history
closes #13611

(Technically, this does not yet support pandas objects that have timezone-aware interval types.)

@rjzamora let me know if the test I adapted from your PR in #15929 is adequate

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15935
  • Loading branch information
mroeschke authored Jun 10, 2024
1 parent 9b2c35f commit e3ba131
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 49 deletions.
27 changes: 7 additions & 20 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
"yet supported in pyarrow, see: "
"https://github.com/apache/arrow/issues/20213"
)
elif pa.types.is_timestamp(array.type) and array.type.tz is not None:
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif pa.types.is_large_string(array.type):
Expand Down Expand Up @@ -992,9 +988,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
return col
elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
return col.as_decimal_column(dtype)
elif np.issubdtype(cast(Any, dtype), np.datetime64):
elif dtype.kind == "M":
return col.as_datetime_column(dtype)
elif np.issubdtype(cast(Any, dtype), np.timedelta64):
elif dtype.kind == "m":
return col.as_timedelta_column(dtype)
elif dtype.kind == "O":
if cudf.get_option("mode.pandas_compatible") and was_object:
Expand Down Expand Up @@ -1846,21 +1842,11 @@ def as_column(
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
elif (
isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
or (
isinstance(arbitrary.dtype, pd.IntervalDtype)
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary.dtype, pd.CategoricalDtype)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance(
arbitrary.dtype.subtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
"cuDF does not yet support Intervals with timezone-aware datetimes"
)
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
if cudf.get_option("mode.pandas_compatible"):
Expand All @@ -1876,7 +1862,8 @@ def as_column(
length=length,
)
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
arbitrary.dtype,
(pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype),
):
return as_column(
pa.array(arbitrary, from_pandas=True),
Expand Down
11 changes: 4 additions & 7 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1757,13 +1757,10 @@ def __init__(
name = _getdefault_name(data, name=name)
data = column.as_column(data)

# TODO: Remove this if statement and fix tests now that
# there's timezone support
if isinstance(data.dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = data.astype(dtype)
# TODO: if data.dtype.kind == "M" (i.e. data is already datetime type)
# We probably shouldn't always astype to datetime64[ns]
if not isinstance(data.dtype, pd.DatetimeTZDtype):
data = data.astype(dtype)

if copy:
data = data.copy()
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected):
def test_tz_convert_naive_typeerror():
    """Calling ``tz_convert`` on a naive date range raises TypeError."""
    with pytest.raises(TypeError):
        # The range is built without a timezone, so there is nothing to
        # convert from.
        naive_range = cudf.date_range("2020", periods=2, freq="D")
        naive_range.tz_convert(None)


@pytest.mark.parametrize(
    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
)
def test_from_pandas_obj_tz_aware(klass):
    """Convert a timezone-aware pandas object with ``cudf.from_pandas``.

    The converted object must equal the cudf object of the same class
    built directly from the same timezone-aware timestamps.
    """
    utc_stamp = pd.Timestamp("2020-01-01", tz="UTC")
    tz_aware_values = [utc_stamp.tz_convert("US/Pacific")]
    # ``klass`` names a constructor that exists in both pandas and cudf.
    pandas_ctor = getattr(pd, klass)
    cudf_ctor = getattr(cudf, klass)
    converted = cudf.from_pandas(pandas_ctor(tz_aware_values))
    assert_eq(converted, cudf_ctor(tz_aware_values))
26 changes: 4 additions & 22 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype):
assert_eq(expected, actual)


# NOTE(review): the hunk header above (-2088,25 +2088,6) indicates this whole
# test is deleted by the commit. It pinned the previous behaviour, in which
# constructing a cudf Series / Index / DatetimeIndex / CategoricalIndex from
# timezone-aware timestamps raised NotImplementedError.
@pytest.mark.parametrize(
"data",
[
[pd.Timestamp("2001-01-01", tz="America/New_York")],
pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
],
)
def test_construction_from_tz_timestamps(data):
# Each constructor is checked in its own ``raises`` context so that every
# one of them must raise independently.
with pytest.raises(NotImplementedError):
_ = cudf.Series(data)
with pytest.raises(NotImplementedError):
_ = cudf.Index(data)
with pytest.raises(NotImplementedError):
_ = cudf.DatetimeIndex(data)
with pytest.raises(NotImplementedError):
cudf.CategoricalIndex(data)


@pytest.mark.parametrize("op", _cmpops)
def test_datetime_binop_tz_timestamp(op):
s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
Expand Down Expand Up @@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr):
t.dt.ceil(freqstr)


def test_timezone_pyarrow_array():
    """Build a cudf Series from a timezone-aware pyarrow timestamp array.

    The result is compared against pyarrow's own pandas conversion of the
    same array. This replaces the former ``test_timezone_array_notimplemented``
    check, which expected ``NotImplementedError`` from the same constructor
    call.
    """
    pa_array = pa.array(
        [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)],
        type=pa.timestamp("ns", "UTC"),
    )
    result = cudf.Series(pa_array)
    expected = pa_array.to_pandas()
    assert_eq(result, expected)


def test_to_datetime_errors_ignore_deprecated():
Expand Down
20 changes: 20 additions & 0 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir):
# Check result
expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True)
dd.assert_eq(got, expected)


def test_timezone_column(tmpdir):
    """Read a parquet file with a UTC timestamp column through dask_cudf.

    The ``time`` column must come back with a UTC ``DatetimeTZDtype``; it is
    then cast to ``datetime64[ns]`` so the frame can be compared with what
    ``cudf.read_parquet`` returns for the same file.
    """
    path = str(tmpdir.join("test.parquet"))
    utc_times = pd.to_datetime(["1996-01-02", "1996-12-01"], utc=True)
    pd.DataFrame({"time": utc_times, "x": [1, 2]}).to_parquet(path)

    got = dask_cudf.read_parquet(path)
    # cudf.read_parquet does not support reading timezone aware types yet,
    # so the timezone is dropped from the dask_cudf result before comparing.
    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
    got["time"] = got["time"].astype("datetime64[ns]")
    expected = cudf.read_parquet(path)
    dd.assert_eq(got, expected)

0 comments on commit e3ba131

Please sign in to comment.