From d9331f23c287162e8941b8b5af87cd8b2744a193 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 25 May 2023 12:22:04 -0400 Subject: [PATCH] Backport PR #53382: BUG: convert_dtypes(dtype_backend="pyarrow") losing tz for tz-aware dtypes --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 3 +++ pandas/core/dtypes/cast.py | 4 +++- pandas/tests/frame/methods/test_convert_dtypes.py | 13 ++++++++++++- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 1ebb512776556..b23f92cc51f1c 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.convert_dtypes` losing timezone for tz-aware dtypes and ``dtype_backend="pyarrow"`` (:issue:`53382`) - Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled when a scalar is passed to it (:issue:`52450`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 611ef142a72a5..445ec36135d5f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -53,6 +53,7 @@ is_object_dtype, is_scalar, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core import roperator @@ -168,6 +169,8 @@ def to_pyarrow_type( return dtype.pyarrow_dtype elif isinstance(dtype, pa.DataType): return dtype + elif isinstance(dtype, DatetimeTZDtype): + return pa.timestamp(dtype.unit, dtype.tz) elif dtype: try: # Accepts python types too diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 156c7c67c7011..2dbd9465be3c6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1134,7 +1134,9 @@ def convert_dtypes( and not isinstance(inferred_dtype, StringDtype) ) ): - if isinstance(inferred_dtype, PandasExtensionDtype): + if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( + inferred_dtype, DatetimeTZDtype + ): base_dtype = inferred_dtype.base elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): base_dtype = inferred_dtype.numpy_dtype diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index a749cd11df4f7..2adee158379bb 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -53,7 +53,8 @@ def test_pyarrow_dtype_backend(self): "c": pd.Series([True, False, None], dtype=np.dtype("O")), "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), "e": pd.Series(pd.date_range("2022", periods=3)), - "f": pd.Series(pd.timedelta_range("1D", periods=3)), + "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")), + "g": pd.Series(pd.timedelta_range("1D", periods=3)), } ) result = df.convert_dtypes(dtype_backend="pyarrow") @@ -76,6 +77,16 @@ def test_pyarrow_dtype_backend(self): ) ), "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="s", tz="UTC"), + ) + ), + "g": pd.arrays.ArrowExtensionArray( pa.array( [ datetime.timedelta(1),