diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 3a6d3cc2141be..4cb3a7b59a814 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -281,6 +281,7 @@ Other enhancements
 - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
 - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
+- Added methods :meth:`DataFrame.from_pyarrow` and :meth:`DataFrame.to_pyarrow` to convert data from and to PyArrow tables (:issue:`51760`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8cd0ffadcc17c..12b1941c2309d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -171,6 +171,7 @@
     PeriodArray,
     TimedeltaArray,
 )
+from pandas.core.arrays.arrow import ArrowDtype
 from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.construction import (
     ensure_wrapped_if_datetimelike,
@@ -1763,6 +1764,84 @@ def create_index(indexlist, namelist):
             columns = create_index(data["columns"], data["column_names"])
             return cls(realdata, index=index, columns=columns, dtype=dtype)
 
+    @classmethod
+    def from_pyarrow(
+        cls,
+        table,
+    ) -> DataFrame:
+        """
+        Convert a pyarrow table to DataFrame.
+
+        Parameters
+        ----------
+        table : pyarrow.Table
+            Table to convert.
+
+        Returns
+        -------
+        DataFrame
+            An instance of the class this method is called on.
+
+        See Also
+        --------
+        DataFrame.to_pyarrow : Convert a DataFrame to a pyarrow table.
+        pyarrow.Table.to_pandas : The pyarrow method performing the conversion.
+
+        Notes
+        -----
+        The conversion is zero-copy, and the resulting DataFrame uses
+        Arrow-backed data types.
+
+        For customization of the conversion use the
+        `to_pandas <https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas>`__
+        method of the Table object.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table([pa.array([1, 2, 3])], names=["my_column"])
+        >>> pd.DataFrame.from_pyarrow(table)
+           my_column
+        0          1
+        1          2
+        2          3
+        """
+        result = table.to_pandas(types_mapper=ArrowDtype)
+        if cls is not DataFrame:
+            # Return an instance of the calling subclass, not a plain DataFrame.
+            result = cls(result)
+        return result
+
+    def to_pyarrow(
+        self,
+    ):
+        """
+        Convert the DataFrame to a pyarrow.Table object.
+
+        Returns
+        -------
+        pyarrow.Table
+
+        See Also
+        --------
+        DataFrame.from_pyarrow : Convert a pyarrow table to a DataFrame.
+        pyarrow.Table.from_pandas : The pyarrow method performing the conversion.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.to_pyarrow()
+        pyarrow.Table
+        col1: int64
+        col2: int64
+        ----
+        col1: [[1,2]]
+        col2: [[3,4]]
+        """
+        import pyarrow as pa
+
+        return pa.Table.from_pandas(self)
+
     def to_numpy(
         self,
         dtype: npt.DTypeLike | None = None,
diff --git a/pandas/tests/frame/constructors/test_pyarrow.py b/pandas/tests/frame/constructors/test_pyarrow.py
new file mode 100644
index 0000000000000..50051b7c48935
--- /dev/null
+++ b/pandas/tests/frame/constructors/test_pyarrow.py
@@ -0,0 +1,102 @@
+import pytest
+
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.core.arrays.arrow import ArrowDtype
+
+pa = pytest.importorskip("pyarrow", minversion="7.0.0")
+
+
+@pytest.fixture(scope="module")
+def sample_dataframe_numpy_backend():
+    return DataFrame(
+        {
+            "u8": Series([1, 2, 3, NA], dtype="UInt8"),
+            "f64": Series([float("NaN"), 1.0, 2.0, 3.0], dtype="float64"),
+            "s": Series(["foo", "bar", None, "foobar"], dtype="object"),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def sample_dataframe_pyarrow_backend():
+    return DataFrame(
+        {
+            "u8": Series([1, 2, 3, NA], dtype="uint8[pyarrow]"),
+            "f64": Series([NA, 1.0, 2.0, 3.0], dtype="float64[pyarrow]"),
+            # "string[pyarrow]" would give StringDtype, not the ArrowDtype that
+            # from_pyarrow produces.
+            "s": Series(["foo", "bar", NA, "foobar"], dtype=ArrowDtype(pa.string())),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def sample_pyarrow_table():
+    return pa.table(
+        [
+            pa.array([1, 2, 3, None], type=pa.uint8()),
+            pa.array([None, 1.0, 2.0, 3.0], type=pa.float64()),
+            pa.array(["foo", "bar", None, "foobar"], type=pa.string()),
+        ],
+        names=["u8", "f64", "s"],
+    )
+
+
+class TestPyArrow:
+    @pytest.mark.parametrize(
+        "column,dtype", [("u8", pa.uint8()), ("f64", pa.float64()), ("s", pa.string())]
+    )
+    def test_from_pyarrow_uses_right_pandas_types(
+        self, sample_pyarrow_table, column, dtype
+    ):
+        result = DataFrame.from_pyarrow(sample_pyarrow_table)
+        assert result[column].dtype == ArrowDtype(dtype)
+
+    @pytest.mark.parametrize("column", ["u8", "f64", "s"])
+    def test_from_pyarrow_keeps_correct_data(self, sample_pyarrow_table, column):
+        result = DataFrame.from_pyarrow(sample_pyarrow_table)
+        assert result[column].array._data == sample_pyarrow_table[column]
+
+    @pytest.mark.parametrize("column", ["u8", "f64", "s"])
+    def test_from_pyarrow_does_not_copy_memory(self, sample_pyarrow_table, column):
+        result = DataFrame.from_pyarrow(sample_pyarrow_table)
+
+        result_buffers = result[column].array._data.chunks[0].buffers()
+        expected_buffers = sample_pyarrow_table[column].chunks[0].buffers()
+
+        assert len(result_buffers) == len(expected_buffers)
+        for result_buffer, expected_buffer in zip(result_buffers, expected_buffers):
+            if expected_buffer is None:
+                # An absent buffer must be absent on both sides.
+                assert result_buffer is None
+                continue
+            assert result_buffer is not None
+            assert result_buffer.address == expected_buffer.address
+            assert result_buffer.size == expected_buffer.size
+
+    def test_from_pyarrow_returns_subclass(self, sample_pyarrow_table):
+        result = tm.SubclassedDataFrame.from_pyarrow(sample_pyarrow_table)
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+    def test_to_pyarrow_numpy_backend(
+        self, sample_dataframe_numpy_backend, sample_pyarrow_table
+    ):
+        result = sample_dataframe_numpy_backend.to_pyarrow()
+        assert result == sample_pyarrow_table
+
+    def test_to_pyarrow_pyarrow_backend(
+        self, sample_dataframe_pyarrow_backend, sample_pyarrow_table
+    ):
+        result = sample_dataframe_pyarrow_backend.to_pyarrow()
+        assert result == sample_pyarrow_table
+
+    def test_pyarrow_roundtrip(
+        self, sample_dataframe_numpy_backend, sample_dataframe_pyarrow_backend
+    ):
+        result = DataFrame.from_pyarrow(sample_dataframe_numpy_backend.to_pyarrow())
+        tm.assert_frame_equal(result, sample_dataframe_pyarrow_backend)