From 1c986d6213904fd7d9acc5622dc91d029d3f1218 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 20 Nov 2024 23:52:11 -0800 Subject: [PATCH] ENH: expose `to_pandas_kwargs` in `read_parquet` with pyarrow backend (#59654) Co-authored-by: Joseph Kleinhenz Co-authored-by: Xiao Yuan Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/_util.py | 5 ++++- pandas/io/parquet.py | 22 ++++++++++++++++++++-- pandas/tests/io/test_parquet.py | 14 ++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5f7aed8ed9786..fbf2bed550c85 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,6 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 21203ad036fc6..9778a404e23e0 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -60,9 +60,12 @@ def arrow_table_to_pandas( table: pyarrow.Table, dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs + types_mapper: type[pd.ArrowDtype] | None | Callable if dtype_backend == "numpy_nullable": mapping = _arrow_dtype_mapping() @@ -80,5 +83,5 @@ def arrow_table_to_pandas( else: raise NotImplementedError - df = table.to_pandas(types_mapper=types_mapper) + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) return df diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 116f228faca93..6a5a83088e986 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -242,6 +242,7 @@ def read( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True @@ -266,7 +267,11 @@ def read( "make_block is deprecated", DeprecationWarning, ) - result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -347,6 +352,7 @@ def read( filters=None, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} @@ -362,6 +368,10 @@ def read( raise NotImplementedError( "filesystem is not implemented for the fastparquet engine." ) + if to_pandas_kwargs is not None: + raise NotImplementedError( + "to_pandas_kwargs is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -452,7 +462,7 @@ def to_parquet( .. versionadded:: 2.1.0 kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- @@ -491,6 +501,7 @@ def read_parquet( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, filters: list[tuple] | list[list[tuple]] | None = None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: """ @@ -564,6 +575,12 @@ def read_parquet( .. versionadded:: 2.1.0 + to_pandas_kwargs : dict | None, default None + Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas` + when ``engine="pyarrow"``. + + .. versionadded:: 3.0.0 + **kwargs Any additional kwargs are passed to the engine. @@ -636,5 +653,6 @@ def read_parquet( storage_options=storage_options, dtype_backend=dtype_backend, filesystem=filesystem, + to_pandas_kwargs=to_pandas_kwargs, **kwargs, ) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 31cdb6626d237..7919bb956dc7a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1172,6 +1172,20 @@ def test_non_nanosecond_timestamps(self, temp_file): ) tm.assert_frame_equal(result, expected) + def test_maps_as_pydicts(self, pa): + pyarrow = pytest.importorskip("pyarrow", "13.0.0") + + schema = pyarrow.schema( + [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] + ) + df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) + check_round_trip( + df, + pa, + write_kwargs={"schema": schema}, + read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, + ) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full, request):