
ENH: Implement DataFrame.from_pyarrow and DataFrame.to_pyarrow #51769

Closed
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -281,6 +281,7 @@ Other enhancements
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
- :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
- Added methods :meth:`DataFrame.from_pyarrow` and :meth:`DataFrame.to_pyarrow` to convert data from and to PyArrow tables (:issue:`51760`)

.. ---------------------------------------------------------------------------
.. _whatsnew_200.notable_bug_fixes:
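A rough usage sketch of the two methods this PR proposes (assumes pyarrow is installed; the table and column names are made up for illustration):

import pandas as pd
import pyarrow as pa

table = pa.table({"my_column": [1, 2, 3]})

df = pd.DataFrame.from_pyarrow(table)   # DataFrame with Arrow-backed (ArrowDtype) columns
table_again = df.to_pyarrow()           # back to a pyarrow.Table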
77 changes: 77 additions & 0 deletions pandas/core/frame.py
@@ -171,6 +171,10 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.arrow import (
ArrowDtype,
ArrowExtensionArray,
)
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
@@ -1763,6 +1767,79 @@ def create_index(indexlist, namelist):
columns = create_index(data["columns"], data["column_names"])
return cls(realdata, index=index, columns=columns, dtype=dtype)

@classmethod
def from_pyarrow(
cls,
table,
) -> DataFrame:
"""
Convert a pyarrow.Table to a DataFrame.

Parameters
----------
table : pyarrow.Table
PyArrow table to be converted into a DataFrame.

Returns
-------
DataFrame

See Also
--------
DataFrame.to_pyarrow
pyarrow.Table.to_pandas

Notes
-----
The conversion is zero-copy, and the resulting DataFrame uses
Arrow-backed data types.

Review comment (Contributor): Not sure that the term "zero-copy" is well understood, so maybe elaborate?

For customization of the conversion use the
`to_pandas <https://arrow.apache.org/docs/python/generated/\
pyarrow.Table.html#pyarrow.Table.to_pandas>`__
method of the Table object.

Examples
--------
>>> import pyarrow as pa
>>> table = pa.table([pa.array([1, 2, 3])], names=["my_column"])
>>> pd.DataFrame.from_pyarrow(table)
   my_column
0          1
1          2
2          3
"""
return table.to_pandas(types_mapper=ArrowDtype)
Review comment (Contributor): You should do an instance check here; e.g. if this is called with a non-Arrow table you would get a hard-to-understand error.
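A minimal sketch of the suggested check, assuming the module-level ArrowDtype import added earlier in this diff; the error-message wording is illustrative, not part of the PR:

@classmethod
def from_pyarrow(cls, table) -> DataFrame:
    import pyarrow as pa

    if not isinstance(table, pa.Table):
        # Fail early with a clear message instead of an opaque AttributeError
        # from calling .to_pandas() on an arbitrary object.
        raise TypeError(
            f"Expected a pyarrow.Table, got {type(table).__name__!r}"
        )
    return table.to_pandas(types_mapper=ArrowDtype)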


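Regarding the "For customization of the conversion" note in the docstring above, the caller would call to_pandas on the table directly; a sketch (pd.ArrowDtype here stands in for the module-internal ArrowDtype the PR uses):

import pandas as pd
import pyarrow as pa

table = pa.table({"my_column": [1, 2, 3]})

# Default conversion: numpy-backed dtypes (here int64), with pyarrow's own
# index and type-handling options available as keyword arguments.
df_numpy = table.to_pandas()

# Roughly what from_pyarrow does: map every Arrow type to a pandas ArrowDtype.
df_arrow = table.to_pandas(types_mapper=pd.ArrowDtype)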
def to_pyarrow(
self,
):
"""
Convert the DataFrame to a pyarrow.Table object.

Returns
-------
pyarrow.Table

See Also
--------
DataFrame.from_pyarrow
pyarrow.Table.from_pandas

Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_pyarrow()
pyarrow.Table
col1: int64
col2: int64
----
col1: [[1,2]]
col2: [[3,4]]
"""
import pyarrow as pa

return pa.Table.from_pandas(self)
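For finer control in the other direction (index handling, explicit schema), a caller would presumably use pyarrow.Table.from_pandas directly rather than this wrapper; a sketch:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# preserve_index=False drops the RangeIndex instead of storing it in the
# table; an explicit schema= can also be supplied to control column types.
table = pa.Table.from_pandas(df, preserve_index=False)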

def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
92 changes: 92 additions & 0 deletions pandas/tests/frame/constructors/test_pyarrow.py
@@ -0,0 +1,92 @@
import pytest

from pandas import (
NA,
DataFrame,
Series,
)
from pandas.core.arrays.arrow import ArrowDtype

pa = pytest.importorskip("pyarrow", minversion="7.0.0")
import pandas._testing as tm


@pytest.fixture(scope="module")
def sample_dataframe_numpy_backend():
return DataFrame(
{
"u8": Series([1, 2, 3, NA], dtype="UInt8"),
"f64": Series([float("NaN"), 1.0, 2.0, 3.0], dtype="float64"),
"s": Series(["foo", "bar", None, "foobar"], dtype="object"),
}
)


@pytest.fixture(scope="module")
def sample_dataframe_pyarrow_backend():
return DataFrame(
{
"u8": Series([1, 2, 3, NA], dtype="uint8[pyarrow]"),
"f64": Series([NA, 1.0, 2.0, 3.0], dtype="float64[pyarrow]"),
"s": Series(["foo", "bar", NA, "foobar"], dtype="string[pyarrow]"),
}
)


@pytest.fixture(scope="module")
def sample_pyarrow_table():
return pa.table(
[
pa.array([1, 2, 3, None], type=pa.uint8()),
pa.array([None, 1.0, 2.0, 3.0], type=pa.float64()),
pa.array(["foo", "bar", None, "foobar"], type=pa.string()),
],
names=["u8", "f64", "s"],
)


class TestPyArrow:
Review comment (Contributor): Need a skip if pyarrow is not installed.

@pytest.mark.parametrize(
"column,dtype", [("u8", pa.uint8()), ("f64", pa.float64()), ("s", pa.string())]
)
def test_from_pyarrow_uses_right_pandas_types(
self, sample_pyarrow_table, column, dtype
):
result = DataFrame.from_pyarrow(sample_pyarrow_table)
assert result[column].dtype == ArrowDtype(dtype)

@pytest.mark.parametrize("column", ["u8", "f64", "s"])
def test_from_pyarrow_keeps_correct_data(self, sample_pyarrow_table, column):
result = DataFrame.from_pyarrow(sample_pyarrow_table)
assert result[column]._data.array._data == sample_pyarrow_table[column]

@pytest.mark.parametrize("column", ["u8", "f64", "s"])
def test_from_pyarrow_does_not_copy_memory(self, sample_pyarrow_table, column):
result = DataFrame.from_pyarrow(sample_pyarrow_table)

result_buffers = result[column]._data.array._data.chunks[0].buffers()
expected_buffers = sample_pyarrow_table[column].chunks[0].buffers()

for result_buffer, expected_buffer in zip(result_buffers, expected_buffers):
if result_buffer is None and expected_buffer is None:
continue
assert result_buffer.address == expected_buffer.address
assert result_buffer.size == expected_buffer.size

def test_to_pyarrow_numpy_backend(
self, sample_dataframe_numpy_backend, sample_pyarrow_table
):
result = sample_dataframe_numpy_backend.to_pyarrow()
assert result == sample_pyarrow_table

def test_to_pyarrow_pyarrow_backend(
self, sample_dataframe_pyarrow_backend, sample_pyarrow_table
):
result = sample_dataframe_pyarrow_backend.to_pyarrow()
assert result == sample_pyarrow_table

def test_pyarrow_roundtrip(
self, sample_dataframe_numpy_backend, sample_dataframe_pyarrow_backend
):
result = DataFrame.from_pyarrow(sample_dataframe_numpy_backend.to_pyarrow())
tm.assert_frame_equal(result, sample_dataframe_pyarrow_backend)
1 change: 1 addition & 0 deletions pandas/tests/frame/test_constructors.py
@@ -60,6 +60,7 @@
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype

MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
MIXED_INT_DTYPES = [