From d15041b9bb0498bb70fdf5a26ca5bdd80f6519ba Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 25 May 2023 12:22:04 -0400 Subject: [PATCH] BUG: convert_dtypes(dtype_backend="pyarrow") losing tz for tz-aware dtypes (#53382) * BUG: convert_dtypes(dtype_backend="pyarrow") losing tz for tz-aware dtypes * whatsnew --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 3 +++ pandas/core/dtypes/cast.py | 4 +++- pandas/tests/frame/methods/test_convert_dtypes.py | 13 ++++++++++++- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index cec201db7e216..33139f0c1bacf 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -35,6 +35,7 @@ Bug fixes - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.convert_dtypes` losing timezone for tz-aware dtypes and ``dtype_backend="pyarrow"`` (:issue:`53382`) - Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled when a scalar is passed to it (:issue:`52450`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 83cc39591c87e..5a9e4a97eccea 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -39,6 +39,7 @@ is_object_dtype, is_scalar, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core import roperator @@ -170,6 +171,8 @@ def to_pyarrow_type( return dtype.pyarrow_dtype elif isinstance(dtype, pa.DataType): return dtype + elif isinstance(dtype, DatetimeTZDtype): + return pa.timestamp(dtype.unit, dtype.tz) elif dtype: try: # Accepts python types too diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c863e5bb4dbd4..d7e2631e92960 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1097,7 +1097,9 @@ def convert_dtypes( and not isinstance(inferred_dtype, StringDtype) ) ): - if isinstance(inferred_dtype, PandasExtensionDtype): + if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( + inferred_dtype, DatetimeTZDtype + ): base_dtype = inferred_dtype.base elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): base_dtype = inferred_dtype.numpy_dtype diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index a749cd11df4f7..2adee158379bb 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -53,7 +53,8 @@ def test_pyarrow_dtype_backend(self): "c": pd.Series([True, False, None], dtype=np.dtype("O")), "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), "e": pd.Series(pd.date_range("2022", periods=3)), - "f": pd.Series(pd.timedelta_range("1D", periods=3)), + "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")), + "g": pd.Series(pd.timedelta_range("1D", periods=3)), } ) result = df.convert_dtypes(dtype_backend="pyarrow") @@ -76,6 +77,16 @@ def test_pyarrow_dtype_backend(self): ) ), "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="s", tz="UTC"), + ) + ), + "g": pd.arrays.ArrowExtensionArray( pa.array( [ datetime.timedelta(1),