Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Use types_mapper instead of manual conversion #51766

Merged
merged 6 commits into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc

from pandas import (
arrays,
get_option,
)
import pandas as pd
from pandas import get_option
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs

Expand Down Expand Up @@ -140,11 +138,4 @@ def read_feather(
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)

elif dtype_backend == "pyarrow":
return DataFrame(
{
col_name: arrays.ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
13 changes: 3 additions & 10 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from pandas.core.dtypes.generic import ABCIndex

from pandas import (
ArrowDtype,
DataFrame,
MultiIndex,
Series,
Expand Down Expand Up @@ -963,16 +964,8 @@ def read(self) -> DataFrame | Series:
pa_table = pyarrow_json.read_json(self.data)
if self.use_nullable_dtypes:
if get_option("mode.dtype_backend") == "pyarrow":
from pandas.arrays import ArrowExtensionArray

return DataFrame(
{
col_name: ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
return pa_table.to_pandas(types_mapper=ArrowDtype)

elif get_option("mode.dtype_backend") == "pandas":
from pandas.io._util import _arrow_dtype_mapping

Expand Down
14 changes: 4 additions & 10 deletions pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@
is_unsigned_integer_dtype,
)

from pandas.core.arrays import ArrowExtensionArray
from pandas.core.frame import DataFrame
import pandas as pd

from pandas.io.common import (
get_handle,
Expand All @@ -40,6 +39,8 @@
WriteBuffer,
)

from pandas.core.frame import DataFrame


def read_orc(
path: FilePath | ReadBuffer[bytes],
Expand Down Expand Up @@ -127,14 +128,7 @@ def read_orc(
if use_nullable_dtypes:
dtype_backend = get_option("mode.dtype_backend")
if dtype_backend == "pyarrow":
df = DataFrame(
{
col_name: ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
else:
from pandas.io._util import _arrow_dtype_mapping

Expand Down
15 changes: 6 additions & 9 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

import pandas as pd
from pandas import (
DataFrame,
MultiIndex,
arrays,
get_option,
)
from pandas.core.shared_docs import _shared_docs
Expand Down Expand Up @@ -252,14 +252,11 @@ def read(
if dtype_backend == "pandas":
result = pa_table.to_pandas(**to_pandas_kwargs)
elif dtype_backend == "pyarrow":
result = DataFrame(
{
col_name: arrays.ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
# Incompatible types in assignment (expression has type
# "Type[ArrowDtype]", target has type overloaded function
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa
result = pa_table.to_pandas(**to_pandas_kwargs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should be able to move the if pandas: .. elif pyarrow: .. higher up where to_pandas_kwargs["types_mapper"] = ... is defined. And then just have a single result = pa_table.to_pandas(**to_pandas_kwargs) here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoided this on purpose, because this would change semantics. The other if is only reached if nullable_dtypes is set as well, and I wanted to wait for the other discussion before touching anything.


if manager == "array":
result = result._as_manager("array", copy=False)
return result
Expand Down
9 changes: 2 additions & 7 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

from pandas.core.dtypes.inference import is_integer

import pandas as pd
from pandas import (
DataFrame,
arrays,
get_option,
)

Expand Down Expand Up @@ -156,12 +156,7 @@ def read(self) -> DataFrame:
self.kwds["use_nullable_dtypes"]
and get_option("mode.dtype_backend") == "pyarrow"
):
frame = DataFrame(
{
col_name: arrays.ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(table.column_names, table.itercolumns())
}
)
frame = table.to_pandas(types_mapper=pd.ArrowDtype)
else:
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
23 changes: 15 additions & 8 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,14 +1034,7 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full):
df["bool_with_none"] = [True, None, True]

pa_table = pyarrow.Table.from_pandas(df)
expected = pd.DataFrame(
{
col_name: pd.arrays.ArrowExtensionArray(pa_column)
for col_name, pa_column in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
expected = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
# pyarrow infers datetimes as us instead of ns
expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]")
expected["datetime_with_nat"] = expected["datetime_with_nat"].astype(
Expand All @@ -1059,6 +1052,20 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full):
expected=expected,
)

def test_read_use_nullable_types_pyarrow_config_index(self, pa):
df = pd.DataFrame(
{"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]"
)
expected = df.copy()

with pd.option_context("mode.dtype_backend", "pyarrow"):
check_round_trip(
df,
engine=pa,
read_kwargs={"use_nullable_dtypes": True},
expected=expected,
)


class TestParquetFastParquet(Base):
def test_basic(self, fp, df_full):
Expand Down