Skip to content

Commit

Permalink
ENH: expose to_pandas_kwargs in read_parquet with pyarrow backend (
Browse files Browse the repository at this point in the history
…#59654)

Co-authored-by: Joseph Kleinhenz <[email protected]>
Co-authored-by: Xiao Yuan <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
4 people authored Nov 21, 2024
1 parent 72ab3fd commit 1c986d6
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs``, which are forwarded to :meth:`pyarrow.Table.to_pandas`; this enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,12 @@ def arrow_table_to_pandas(
table: pyarrow.Table,
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
null_to_int64: bool = False,
to_pandas_kwargs: dict | None = None,
) -> pd.DataFrame:
pa = import_optional_dependency("pyarrow")

to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs

types_mapper: type[pd.ArrowDtype] | None | Callable
if dtype_backend == "numpy_nullable":
mapping = _arrow_dtype_mapping()
Expand All @@ -80,5 +83,5 @@ def arrow_table_to_pandas(
else:
raise NotImplementedError

df = table.to_pandas(types_mapper=types_mapper)
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
return df
22 changes: 20 additions & 2 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def read(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
storage_options: StorageOptions | None = None,
filesystem=None,
to_pandas_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> DataFrame:
kwargs["use_pandas_metadata"] = True
Expand All @@ -266,7 +267,11 @@ def read(
"make_block is deprecated",
DeprecationWarning,
)
result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
result = arrow_table_to_pandas(
pa_table,
dtype_backend=dtype_backend,
to_pandas_kwargs=to_pandas_kwargs,
)

if pa_table.schema.metadata:
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
Expand Down Expand Up @@ -347,6 +352,7 @@ def read(
filters=None,
storage_options: StorageOptions | None = None,
filesystem=None,
to_pandas_kwargs: dict | None = None,
**kwargs,
) -> DataFrame:
parquet_kwargs: dict[str, Any] = {}
Expand All @@ -362,6 +368,10 @@ def read(
raise NotImplementedError(
"filesystem is not implemented for the fastparquet engine."
)
if to_pandas_kwargs is not None:
raise NotImplementedError(
"to_pandas_kwargs is not implemented for the fastparquet engine."
)
path = stringify_path(path)
handles = None
if is_fsspec_url(path):
Expand Down Expand Up @@ -452,7 +462,7 @@ def to_parquet(
.. versionadded:: 2.1.0
kwargs
Additional keyword arguments passed to the engine
Additional keyword arguments passed to the engine.
Returns
-------
Expand Down Expand Up @@ -491,6 +501,7 @@ def read_parquet(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
filesystem: Any = None,
filters: list[tuple] | list[list[tuple]] | None = None,
to_pandas_kwargs: dict | None = None,
**kwargs,
) -> DataFrame:
"""
Expand Down Expand Up @@ -564,6 +575,12 @@ def read_parquet(
.. versionadded:: 2.1.0
to_pandas_kwargs : dict | None, default None
Keyword arguments to pass through to :meth:`pyarrow.Table.to_pandas`
when ``engine="pyarrow"``.
.. versionadded:: 3.0.0
**kwargs
Any additional kwargs are passed to the engine.
Expand Down Expand Up @@ -636,5 +653,6 @@ def read_parquet(
storage_options=storage_options,
dtype_backend=dtype_backend,
filesystem=filesystem,
to_pandas_kwargs=to_pandas_kwargs,
**kwargs,
)
14 changes: 14 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,20 @@ def test_non_nanosecond_timestamps(self, temp_file):
)
tm.assert_frame_equal(result, expected)

def test_maps_as_pydicts(self, pa):
    """Round-trip a frame holding a Parquet map column via ``to_pandas_kwargs``.

    The frame is written with an explicit pyarrow schema that declares
    ``foo`` as ``map<string, int64>`` and is read back with
    ``to_pandas_kwargs={"maps_as_pydicts": "strict"}``.
    """
    # Skipped when pyarrow is missing or older than the requested version.
    pyarrow = pytest.importorskip("pyarrow", "13.0.0")

    # Explicit schema for the single column "foo": a string -> int64 map.
    map_type = pyarrow.map_(pyarrow.string(), pyarrow.int64())
    map_schema = pyarrow.schema([("foo", map_type)])

    # Each row stores a plain Python dict in the "foo" column.
    rows = [{"foo": {"A": 1}}, {"foo": {"B": 2}}]
    frame = pd.DataFrame(rows)

    # NOTE(review): "strict" is forwarded untouched to the reader; its exact
    # semantics are defined by pyarrow -- confirm against its documentation.
    reader_options = {"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}
    writer_options = {"schema": map_schema}

    check_round_trip(
        frame,
        pa,
        write_kwargs=writer_options,
        read_kwargs=reader_options,
    )


class TestParquetFastParquet(Base):
def test_basic(self, fp, df_full, request):
Expand Down

0 comments on commit 1c986d6

Please sign in to comment.