From 9922504e963f679a597c83e55622ad2bf4fb36f6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 19:37:08 +0100 Subject: [PATCH 1/4] CLN: Use types_mapper instead of manual conversion --- pandas/io/feather_format.py | 15 +++------------ pandas/io/orc.py | 11 ++--------- pandas/io/parquet.py | 13 ++++--------- 3 files changed, 9 insertions(+), 30 deletions(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 8a21d99124ec6..1a09157fabd09 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -18,10 +18,8 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc -from pandas import ( - arrays, - get_option, -) +import pandas as pd +from pandas import get_option from pandas.core.api import ( DataFrame, RangeIndex, @@ -173,11 +171,4 @@ def read_feather( return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) elif dtype_backend == "pyarrow": - return DataFrame( - { - col_name: arrays.ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + return pa_table.to_pandas(types_mapper=pd.ArrowDtype) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 1b9be9adc1196..e6b7037147763 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -29,7 +29,7 @@ is_unsigned_integer_dtype, ) -from pandas.core.arrays import ArrowExtensionArray +import pandas as pd from pandas.core.frame import DataFrame from pandas.io.common import ( @@ -124,14 +124,7 @@ def read_orc( if use_nullable_dtypes: dtype_backend = get_option("mode.dtype_backend") if dtype_backend == "pyarrow": - df = DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) else: from pandas.io._util import _arrow_dtype_mapping diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 
aec31f40f8570..87a06c6e7778c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -22,10 +22,10 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +import pandas as pd from pandas import ( DataFrame, MultiIndex, - arrays, get_option, ) from pandas.core.shared_docs import _shared_docs @@ -250,14 +250,9 @@ def read( if dtype_backend == "pandas": result = pa_table.to_pandas(**to_pandas_kwargs) elif dtype_backend == "pyarrow": - result = DataFrame( - { - col_name: arrays.ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype + result = pa_table.to_pandas(**to_pandas_kwargs) + if manager == "array": result = result._as_manager("array", copy=False) return result From 398e5f249d0f5449b0681c2f6bc6be7fc8b3b9ba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 21:46:25 +0100 Subject: [PATCH 2/4] Fix mypy --- pandas/io/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 87a06c6e7778c..7dc839f47b186 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -250,7 +250,9 @@ def read( if dtype_backend == "pandas": result = pa_table.to_pandas(**to_pandas_kwargs) elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype + # Incompatible types in assignment (expression has type + # "Type[ArrowDtype]", target has type overloaded function) + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa result = pa_table.to_pandas(**to_pandas_kwargs) if manager == "array": From 911711505d3ef08d7a4c0520e816ae071254e62c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Mar 2023 15:08:54 +0100 Subject: [PATCH 3/4] Add test --- pandas/tests/io/test_parquet.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/test_parquet.py 
b/pandas/tests/io/test_parquet.py index 353dc4f1cbd8a..c0b4c5585ed6b 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1059,6 +1059,20 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): expected=expected, ) + def test_read_use_nullable_types_pyarrow_config_index(self, pa): + df = pd.DataFrame( + {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]" + ) + expected = df.copy() + + with pd.option_context("mode.dtype_backend", "pyarrow"): + check_round_trip( + df, + engine=pa, + read_kwargs={"use_nullable_dtypes": True}, + expected=expected, + ) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From 579c4832e43a8771692a73d4921ca43f902d2984 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Mar 2023 14:35:40 +0100 Subject: [PATCH 4/4] Add types_mapper in more places --- pandas/io/json/_json.py | 13 +++---------- pandas/io/parsers/arrow_parser_wrapper.py | 9 ++------- pandas/tests/io/test_parquet.py | 9 +-------- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 80d2f9eda7ce5..3f10e979fa86c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -54,6 +54,7 @@ from pandas.core.dtypes.generic import ABCIndex from pandas import ( + ArrowDtype, DataFrame, MultiIndex, Series, @@ -961,16 +962,8 @@ def read(self) -> DataFrame | Series: pa_table = pyarrow_json.read_json(self.data) if self.use_nullable_dtypes: if get_option("mode.dtype_backend") == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - return DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + return pa_table.to_pandas(types_mapper=ArrowDtype) + elif get_option("mode.dtype_backend") == "pandas": from pandas.io._util import _arrow_dtype_mapping diff --git a/pandas/io/parsers/arrow_parser_wrapper.py 
b/pandas/io/parsers/arrow_parser_wrapper.py index 420b6212f857a..58dfc95c1e5b6 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -5,9 +5,9 @@ from pandas.core.dtypes.inference import is_integer +import pandas as pd from pandas import ( DataFrame, - arrays, get_option, ) @@ -153,12 +153,7 @@ def read(self) -> DataFrame: self.kwds["use_nullable_dtypes"] and get_option("mode.dtype_backend") == "pyarrow" ): - frame = DataFrame( - { - col_name: arrays.ArrowExtensionArray(pa_col) - for col_name, pa_col in zip(table.column_names, table.itercolumns()) - } - ) + frame = table.to_pandas(types_mapper=pd.ArrowDtype) else: frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index c0b4c5585ed6b..2124787e8a80e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1034,14 +1034,7 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): df["bool_with_none"] = [True, None, True] pa_table = pyarrow.Table.from_pandas(df) - expected = pd.DataFrame( - { - col_name: pd.arrays.ArrowExtensionArray(pa_column) - for col_name, pa_column in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + expected = pa_table.to_pandas(types_mapper=pd.ArrowDtype) # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") expected["datetime_with_nat"] = expected["datetime_with_nat"].astype(