Skip to content

Commit

Permalink
GH-35081: [Python] construct pandas.DataFrame with public API in `to_…
Browse files Browse the repository at this point in the history
…pandas` (#40897)

### Rationale for this change

Avoid using pandas internals to create Block objects ourselves; instead, use the new public API available in pandas>=3.

* GitHub Issue: #35081

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
jorisvandenbossche authored and raulcd committed Apr 16, 2024
1 parent b7187ad commit 5a146ed
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 34 deletions.
7 changes: 6 additions & 1 deletion python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
bint _is_v1, _is_ge_v21
bint _is_v1, _is_ge_v21, _is_ge_v3

def __init__(self):
self._lock = Lock()
Expand Down Expand Up @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object):

self._is_v1 = self._loose_version < Version('2.0.0')
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._is_ge_v21

def is_ge_v3(self):
    """Return True if the imported pandas version is >= 3.0.0.dev0.

    Mirrors ``is_ge_v21``: ensures pandas has been imported (raising if the
    import failed) before reporting the cached version flag.
    """
    self._check_import()
    return self._is_ge_v3

@property
def categorical_type(self):
self._check_import()
Expand Down
75 changes: 42 additions & 33 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
# Converting pyarrow.Table efficiently to pandas.DataFrame


def _reconstruct_block(item, columns=None, extension_columns=None):
def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
Expand Down Expand Up @@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
block_arr = item.get('block', None)
placement = item['placement']
if 'dictionary' in item:
cat = _pandas_api.categorical_type.from_codes(
arr = _pandas_api.categorical_type.from_codes(
block_arr, categories=item['dictionary'],
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
if _pandas_api.is_ge_v21():
pd_arr = _pandas_api.pd.array(
arr = _pandas_api.pd.array(
block_arr.view("int64"), dtype=dtype, copy=False
)
block = _int.make_block(pd_arr, placement=placement)
else:
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
arr = block_arr
if return_block:
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
return block
elif 'py_array' in item:
# create ExtensionBlock
arr = item['py_array']
Expand All @@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
block = _int.make_block(pd_ext_arr, placement=placement)
arr = pandas_dtype.__from_arrow__(arr)
else:
block = _int.make_block(block_arr, placement=placement)
arr = block_arr

return block
if return_block:
return _int.make_block(arr, placement=placement)
else:
return arr, placement


def make_datetimetz(unit, tz):
Expand All @@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
def table_to_dataframe(
options, table, categories=None, ignore_metadata=False, types_mapper=None
):
from pandas.core.internals import BlockManager
from pandas import DataFrame

all_columns = []
column_indexes = []
pandas_metadata = table.schema.pandas_metadata
Expand All @@ -774,15 +774,35 @@ def table_to_dataframe(

_check_data_column_metadata_consistency(all_columns)
columns = _deserialize_column_index(table, all_columns, column_indexes)
blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)

axes = [columns, index]
mgr = BlockManager(blocks, axes)
if _pandas_api.is_ge_v21():
df = DataFrame._from_mgr(mgr, mgr.axes)
column_names = table.column_names
result = pa.lib.table_to_blocks(options, table, categories,
list(ext_columns_dtypes.keys()))
if _pandas_api.is_ge_v3():
from pandas.api.internals import create_dataframe_from_blocks

blocks = [
_reconstruct_block(
item, column_names, ext_columns_dtypes, return_block=False)
for item in result
]
df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
return df
else:
df = DataFrame(mgr)
return df
from pandas.core.internals import BlockManager
from pandas import DataFrame

blocks = [
_reconstruct_block(item, column_names, ext_columns_dtypes)
for item in result
]
axes = [columns, index]
mgr = BlockManager(blocks, axes)
if _pandas_api.is_ge_v21():
df = DataFrame._from_mgr(mgr, mgr.axes)
else:
df = DataFrame(mgr)
return df


# Set of the string repr of all numpy dtypes that can be stored in a pandas
Expand Down Expand Up @@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
return pd.Index(new_levels[0], dtype=new_levels[0].dtype, name=columns.name)


def _table_to_blocks(options, block_table, categories, extension_columns):
    # Part of table_to_blockmanager.
    #
    # Convert an arrow table into a list of pandas Blocks by first producing
    # raw block descriptors with pyarrow's C++ converter and then rebuilding
    # each one through the internal pandas API.
    column_names = block_table.column_names
    extension_keys = list(extension_columns.keys())
    raw_items = pa.lib.table_to_blocks(
        options, block_table, categories, extension_keys)
    return [
        _reconstruct_block(raw_item, column_names, extension_columns)
        for raw_item in raw_items
    ]


def _add_any_metadata(table, pandas_metadata):
modified_columns = {}
modified_fields = {}
Expand Down

0 comments on commit 5a146ed

Please sign in to comment.