-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Implement DataFrame.from_pyarrow and DataFrame.to_pyarrow #51769
Changes from 3 commits
1d013db
f08f863
04b3897
a62e184
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -171,6 +171,10 @@ | |
PeriodArray, | ||
TimedeltaArray, | ||
) | ||
from pandas.core.arrays.arrow import ( | ||
ArrowDtype, | ||
ArrowExtensionArray, | ||
) | ||
from pandas.core.arrays.sparse import SparseFrameAccessor | ||
from pandas.core.construction import ( | ||
ensure_wrapped_if_datetimelike, | ||
|
@@ -1763,6 +1767,79 @@ def create_index(indexlist, namelist): | |
columns = create_index(data["columns"], data["column_names"]) | ||
return cls(realdata, index=index, columns=columns, dtype=dtype) | ||
|
||
@classmethod | ||
def from_pyarrow( | ||
cls, | ||
table, | ||
) -> DataFrame: | ||
""" | ||
Convert a pyarrow table to DataFrame. | ||
|
||
Parameters | ||
---------- | ||
table: pyarrow.Table | ||
|
||
Returns | ||
------- | ||
DataFrame | ||
|
||
See Also | ||
-------- | ||
DataFrame.to_pyarrow | ||
pyarrow.Table.to_pandas | ||
|
||
Notes | ||
----- | ||
The conversion is zero-copy, and the resulting DataFrame uses | ||
Arrow-backend data types. | ||
|
||
For customization of the conversion use the | ||
`to_pandas <https://arrow.apache.org/docs/python/generated/\ | ||
pyarrow.Table.html#pyarrow.Table.to_pandas>`__ | ||
method of the Table object. | ||
|
||
Examples | ||
-------- | ||
>>> import pyarrow as pa | ||
>>> table = pa.table([pa.array([1, 2, 3])], names=["my_column"]) | ||
>>> pd.DataFrame.from_pyarrow(table) | ||
my_column | ||
0 1 | ||
1 2 | ||
2 3 | ||
""" | ||
return table.to_pandas(types_mapper=ArrowDtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you should do an instance check here ; eg if this is called with a non arrow table you would get a hard to understand error |
||
|
||
def to_pyarrow( | ||
self, | ||
): | ||
""" | ||
Convert the DataFrame to a pyarrow.Table object. | ||
|
||
Returns | ||
------- | ||
pyarrow.Table | ||
|
||
See Also | ||
-------- | ||
DataFrame.from_pyarrow | ||
pyarrow.Table.from_pandas | ||
|
||
Examples | ||
-------- | ||
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) | ||
>>> df.to_pyarrow() | ||
pyarrow.Table | ||
col1: int64 | ||
col2: int64 | ||
---- | ||
col1: [[1,2]] | ||
col2: [[3,4]] | ||
""" | ||
import pyarrow as pa | ||
|
||
return pa.Table.from_pandas(self) | ||
|
||
def to_numpy( | ||
self, | ||
dtype: npt.DTypeLike | None = None, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import pytest | ||
|
||
from pandas import ( | ||
NA, | ||
DataFrame, | ||
Series, | ||
) | ||
from pandas.core.arrays.arrow import ArrowDtype | ||
|
||
pa = pytest.importorskip("pyarrow", minversion="7.0.0") | ||
import pandas._testing as tm | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def sample_dataframe_numpy_backend(): | ||
return DataFrame( | ||
{ | ||
"u8": Series([1, 2, 3, NA], dtype="UInt8"), | ||
"f64": Series([float("NaN"), 1.0, 2.0, 3.0], dtype="float64"), | ||
"s": Series(["foo", "bar", None, "foobar"], dtype="object"), | ||
} | ||
) | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def sample_dataframe_pyarrow_backend(): | ||
return DataFrame( | ||
{ | ||
"u8": Series([1, 2, 3, NA], dtype="uint8[pyarrow]"), | ||
"f64": Series([NA, 1.0, 2.0, 3.0], dtype="float64[pyarrow]"), | ||
"s": Series(["foo", "bar", NA, "foobar"], dtype="string[pyarrow]"), | ||
} | ||
) | ||
|
||
|
||
@pytest.fixture(scope="module") | ||
def sample_pyarrow_table(): | ||
return pa.table( | ||
[ | ||
pa.array([1, 2, 3, None], type=pa.uint8()), | ||
pa.array([None, 1.0, 2.0, 3.0], type=pa.float64()), | ||
pa.array(["foo", "bar", None, "foobar"], type=pa.string()), | ||
], | ||
names=["u8", "f64", "s"], | ||
) | ||
|
||
|
||
class TestPyArrow: | ||
@pytest.mark.parametrize( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need a skip if pyarrow is not installed |
||
"column,dtype", [("u8", pa.uint8()), ("f64", pa.float64()), ("s", pa.string())] | ||
) | ||
def test_from_pyarrow_uses_right_pandas_types( | ||
self, sample_pyarrow_table, column, dtype | ||
): | ||
result = DataFrame.from_pyarrow(sample_pyarrow_table) | ||
assert result[column].dtype == ArrowDtype(dtype) | ||
|
||
@pytest.mark.parametrize("column", ["u8", "f64", "s"]) | ||
def test_from_pyarrow_keeps_correct_data(self, sample_pyarrow_table, column): | ||
result = DataFrame.from_pyarrow(sample_pyarrow_table) | ||
assert result[column]._data.array._data == sample_pyarrow_table[column] | ||
|
||
@pytest.mark.parametrize("column", ["u8", "f64", "s"]) | ||
def test_from_pyarrow_does_not_copy_memory(self, sample_pyarrow_table, column): | ||
result = DataFrame.from_pyarrow(sample_pyarrow_table) | ||
|
||
result_buffers = result[column]._data.array._data.chunks[0].buffers() | ||
expected_buffers = sample_pyarrow_table[column].chunks[0].buffers() | ||
|
||
for result_buffer, expected_buffer in zip(result_buffers, expected_buffers): | ||
if result_buffer is None and expected_buffer is None: | ||
continue | ||
assert result_buffer.address == expected_buffer.address | ||
assert result_buffer.size == expected_buffer.size | ||
|
||
def test_to_pyarrow_numpy_backend( | ||
self, sample_dataframe_numpy_backend, sample_pyarrow_table | ||
): | ||
result = sample_dataframe_numpy_backend.to_pyarrow() | ||
assert result == sample_pyarrow_table | ||
|
||
def test_to_pyarrow_pyarrow_backend( | ||
self, sample_dataframe_pyarrow_backend, sample_pyarrow_table | ||
): | ||
result = sample_dataframe_pyarrow_backend.to_pyarrow() | ||
assert result == sample_pyarrow_table | ||
|
||
def test_pyarrow_roundtrip( | ||
self, sample_dataframe_numpy_backend, sample_dataframe_pyarrow_backend | ||
): | ||
result = DataFrame.from_pyarrow(sample_dataframe_numpy_backend.to_pyarrow()) | ||
tm.assert_frame_equal(result, sample_dataframe_pyarrow_backend) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure that the term "zero-copy" is well understood, so maybe elaborate?