Support timezone aware pandas inputs in cudf (#15935)

closes #13611 (This technically does not support pandas objects have interval types that are timezone aware) @rjzamora let me know if the test I adapted from your PR in #15929 is adequate Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: #15935
rapidsai · Jun 10, 2024 · e3ba131 · e3ba131
1 parent 9b2c35f
commit e3ba131
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 49 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 "yet supported in pyarrow, see: "
                 "https://github.com/apache/arrow/issues/20213"
             )
-        elif pa.types.is_timestamp(array.type) and array.type.tz is not None:
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
         elif pa.types.is_large_string(array.type):
@@ -992,9 +988,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
             return col
         elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
             return col.as_decimal_column(dtype)
-        elif np.issubdtype(cast(Any, dtype), np.datetime64):
+        elif dtype.kind == "M":
             return col.as_datetime_column(dtype)
-        elif np.issubdtype(cast(Any, dtype), np.timedelta64):
+        elif dtype.kind == "m":
             return col.as_timedelta_column(dtype)
         elif dtype.kind == "O":
             if cudf.get_option("mode.pandas_compatible") and was_object:
@@ -1846,21 +1842,11 @@ def as_column(
             and arbitrary.freq is not None
         ):
             raise NotImplementedError("freq is not implemented yet")
-        elif (
-            isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
-            or (
-                isinstance(arbitrary.dtype, pd.IntervalDtype)
-                and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
-            )
-            or (
-                isinstance(arbitrary.dtype, pd.CategoricalDtype)
-                and isinstance(
-                    arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
-                )
-            )
+        elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance(
+            arbitrary.dtype.subtype, pd.DatetimeTZDtype
         ):
             raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
+                "cuDF does not yet support Intervals with timezone-aware datetimes"
             )
         elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
             if cudf.get_option("mode.pandas_compatible"):
@@ -1876,7 +1862,8 @@ def as_column(
                 length=length,
             )
         elif isinstance(
-            arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
+            arbitrary.dtype,
+            (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype),
         ):
             return as_column(
                 pa.array(arbitrary, from_pandas=True),

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -1757,13 +1757,10 @@ def __init__(
         name = _getdefault_name(data, name=name)
         data = column.as_column(data)
 
-        # TODO: Remove this if statement and fix tests now that
-        # there's timezone support
-        if isinstance(data.dtype, pd.DatetimeTZDtype):
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
-        data = data.astype(dtype)
+        # TODO: if data.dtype.kind == "M" (i.e. data is already datetime type)
+        # We probably shouldn't always astype to datetime64[ns]
+        if not isinstance(data.dtype, pd.DatetimeTZDtype):
+            data = data.astype(dtype)
 
         if copy:
             data = data.copy()

diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected):
 def test_tz_convert_naive_typeerror():
     with pytest.raises(TypeError):
         cudf.date_range("2020", periods=2, freq="D").tz_convert(None)
+
+
+@pytest.mark.parametrize(
+    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
+)
+def test_from_pandas_obj_tz_aware(klass):
+    tz_aware_data = [
+        pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific")
+    ]
+    pandas_obj = getattr(pd, klass)(tz_aware_data)
+    result = cudf.from_pandas(pandas_obj)
+    expected = getattr(cudf, klass)(tz_aware_data)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
@@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype):
     assert_eq(expected, actual)
 
 
-@pytest.mark.parametrize(
-    "data",
-    [
-        [pd.Timestamp("2001-01-01", tz="America/New_York")],
-        pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
-        pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
-    ],
-)
-def test_construction_from_tz_timestamps(data):
-    with pytest.raises(NotImplementedError):
-        _ = cudf.Series(data)
-    with pytest.raises(NotImplementedError):
-        _ = cudf.Index(data)
-    with pytest.raises(NotImplementedError):
-        _ = cudf.DatetimeIndex(data)
-    with pytest.raises(NotImplementedError):
-        cudf.CategoricalIndex(data)
-
-
 @pytest.mark.parametrize("op", _cmpops)
 def test_datetime_binop_tz_timestamp(op):
     s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
@@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr):
         t.dt.ceil(freqstr)
 
 
-def test_timezone_array_notimplemented():
+def test_timezone_pyarrow_array():
     pa_array = pa.array(
         [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)],
         type=pa.timestamp("ns", "UTC"),
     )
-    with pytest.raises(NotImplementedError):
-        cudf.Series(pa_array)
+    result = cudf.Series(pa_array)
+    expected = pa_array.to_pandas()
+    assert_eq(result, expected)
 
 
 def test_to_datetime_errors_ignore_deprecated():

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir):
     # Check result
     expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True)
     dd.assert_eq(got, expected)
+
+
+def test_timezone_column(tmpdir):
+    path = str(tmpdir.join("test.parquet"))
+    pdf = pd.DataFrame(
+        {
+            "time": pd.to_datetime(
+                ["1996-01-02", "1996-12-01"],
+                utc=True,
+            ),
+            "x": [1, 2],
+        }
+    )
+    pdf.to_parquet(path)
+    got = dask_cudf.read_parquet(path)
+    # cudf.read_parquet does not support reading timezone aware types yet
+    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
+    got["time"] = got["time"].astype("datetime64[ns]")
+    expected = cudf.read_parquet(path)
+    dd.assert_eq(got, expected)