Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support timezone aware pandas inputs in cudf #15935

Merged
merged 3 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 7 additions & 20 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
"yet supported in pyarrow, see: "
"https://github.com/apache/arrow/issues/20213"
)
elif pa.types.is_timestamp(array.type) and array.type.tz is not None:
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif pa.types.is_large_string(array.type):
Expand Down Expand Up @@ -990,9 +986,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
return col
elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
return col.as_decimal_column(dtype)
elif np.issubdtype(cast(Any, dtype), np.datetime64):
elif dtype.kind == "M":
return col.as_datetime_column(dtype)
elif np.issubdtype(cast(Any, dtype), np.timedelta64):
elif dtype.kind == "m":
return col.as_timedelta_column(dtype)
elif dtype.kind == "O":
if cudf.get_option("mode.pandas_compatible") and was_object:
Expand Down Expand Up @@ -1842,21 +1838,11 @@ def as_column(
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
elif (
isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
or (
isinstance(arbitrary.dtype, pd.IntervalDtype)
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary.dtype, pd.CategoricalDtype)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance(
arbitrary.dtype.subtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
"cuDF does not yet support Intervals with timezone-aware datetimes"
)
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
if cudf.get_option("mode.pandas_compatible"):
Expand All @@ -1872,7 +1858,8 @@ def as_column(
length=length,
)
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
arbitrary.dtype,
(pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype),
):
return as_column(
pa.array(arbitrary, from_pandas=True),
Expand Down
11 changes: 4 additions & 7 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1757,13 +1757,10 @@ def __init__(
name = _getdefault_name(data, name=name)
data = column.as_column(data)

# TODO: Remove this if statement and fix tests now that
# there's timezone support
if isinstance(data.dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = data.astype(dtype)
# TODO: if data.dtype.kind == "M" (i.e. data is already datetime type)
# We probably shouldn't always astype to datetime64[ns]
if not isinstance(data.dtype, pd.DatetimeTZDtype):
data = data.astype(dtype)

if copy:
data = data.copy()
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected):
def test_tz_convert_naive_typeerror():
    """``tz_convert`` on a date range built without a timezone raises ``TypeError``."""
    with pytest.raises(TypeError):
        # Kept inside the context manager so the pass/fail conditions of the
        # check are exactly those of a single chained expression.
        tz_less_range = cudf.date_range("2020", periods=2, freq="D")
        tz_less_range.tz_convert(None)


@pytest.mark.parametrize(
    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
)
def test_from_pandas_obj_tz_aware(klass):
    """``cudf.from_pandas`` on a tz-aware pandas object matches direct cudf construction.

    ``klass`` names a container class that is looked up on both ``pd`` and
    ``cudf``; the same single-element list of tz-aware timestamps is fed to
    each.
    """
    # One UTC timestamp converted to US/Pacific, so the value carries a
    # non-UTC timezone.
    pacific_stamp = pd.Timestamp("2020-01-01", tz="UTC").tz_convert(
        "US/Pacific"
    )
    timestamps = [pacific_stamp]

    pandas_cls = getattr(pd, klass)
    result = cudf.from_pandas(pandas_cls(timestamps))

    cudf_cls = getattr(cudf, klass)
    expected = cudf_cls(timestamps)

    assert_eq(result, expected)
26 changes: 4 additions & 22 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype):
assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data",
    [
        [pd.Timestamp("2001-01-01", tz="America/New_York")],
        pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
        pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
    ],
)
def test_construction_from_tz_timestamps(data):
    """Each listed cudf constructor raises ``NotImplementedError`` for tz-aware ``data``."""
    # Same constructors, in the same order, as four separate
    # ``pytest.raises`` stanzas would exercise.
    constructors = (
        cudf.Series,
        cudf.Index,
        cudf.DatetimeIndex,
        cudf.CategoricalIndex,
    )
    for build in constructors:
        with pytest.raises(NotImplementedError):
            build(data)


@pytest.mark.parametrize("op", _cmpops)
def test_datetime_binop_tz_timestamp(op):
s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
Expand Down Expand Up @@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr):
t.dt.ceil(freqstr)


def test_timezone_array_notimplemented():
def test_timezone_pyarrow_array():
pa_array = pa.array(
[datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)],
type=pa.timestamp("ns", "UTC"),
)
with pytest.raises(NotImplementedError):
cudf.Series(pa_array)
result = cudf.Series(pa_array)
expected = pa_array.to_pandas()
assert_eq(result, expected)


def test_to_datetime_errors_ignore_deprecated():
Expand Down
20 changes: 20 additions & 0 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir):
# Check result
expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True)
dd.assert_eq(got, expected)


def test_timezone_column(tmpdir):
    """Read a parquet file containing a UTC-aware timestamp column via dask_cudf.

    A two-row pandas frame is written to parquet and read back with
    ``dask_cudf.read_parquet``. The ``time`` column must come back with a UTC
    ``DatetimeTZDtype``; it is then cast to ``datetime64[ns]`` and the frame
    is compared against ``cudf.read_parquet`` of the same file.
    """
    parquet_path = str(tmpdir.join("test.parquet"))

    utc_times = pd.to_datetime(["1996-01-02", "1996-12-01"], utc=True)
    source_frame = pd.DataFrame({"time": utc_times, "x": [1, 2]})
    source_frame.to_parquet(parquet_path)

    got = dask_cudf.read_parquet(parquet_path)

    # cudf.read_parquet does not support reading timezone aware types yet,
    # so the dask_cudf column is checked for its tz-aware dtype first and
    # then cast to datetime64[ns] before the comparison.
    utc_dtype = pd.DatetimeTZDtype("ns", "UTC")
    assert got["time"].dtype == utc_dtype
    got["time"] = got["time"].astype("datetime64[ns]")

    expected = cudf.read_parquet(parquet_path)
    dd.assert_eq(got, expected)
Loading