Skip to content

Commit

Permalink
apacheGH-33727: [Python] array() errors if pandas categorical column …
Browse files Browse the repository at this point in the history
…has dictionary as string not object (apache#34289)

### Rationale for this change
Currently, writing a pandas DataFrame with a categorical column whose categories have dtype `string[pyarrow]` fails. The reason is that when categories with `string[pyarrow]` dtype are converted to an array in pyarrow, the result is a `ChunkedArray`, not an `Array`, and `DictionaryArray.from_arrays` then fails.

### What changes are included in this PR?
The `_handle_arrow_array_protocol` function in _array.pxi_ is updated so that, for a `ChunkedArray` with exactly one chunk, the result is a `pyarrow.Array` and not a `pyarrow.ChunkedArray`.

### Are these changes tested?
Yes. Tests are added to:

- python/pyarrow/tests/parquet/test_pandas.py
- python/pyarrow/tests/test_pandas.py
- python/pyarrow/tests/test_array.py

### Are there any user-facing changes?
No.
* Closes: apache#33727

Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
2 people authored and ArgusLi committed May 15, 2023
1 parent 73763c5 commit 99a00ef
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 1 deletion.
2 changes: 2 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
if not isinstance(res, (Array, ChunkedArray)):
raise TypeError("The object's __arrow_array__ method does not "
"return a pyarrow Array or ChunkedArray.")
if isinstance(res, ChunkedArray) and res.num_chunks==1:
res = res.chunk(0)
return res


Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/tests/parquet/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from pyarrow.tests.parquet.common import (
parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version

try:
import pyarrow.parquet as pq
Expand Down Expand Up @@ -556,6 +557,30 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset):
tm.assert_frame_equal(result, df)


@pytest.mark.pandas
def test_categories_with_string_pyarrow_dtype(tempdir):
    """Regression test for gh-33727.

    A categorical column whose categories use the ``string[pyarrow]``
    dtype must convert to the same Arrow arrays as a categorical column
    built from plain Python strings, and writing it to Parquet must not
    fail.
    """
    # gh-33727: writing to parquet should not fail
    if Version(pd.__version__) < Version("1.3.0"):
        pytest.skip(
            "PyArrow backed string data type introduced in pandas 1.3.0")

    # Same values twice: once with pyarrow-backed strings, once with the
    # default dtype, both cast to categorical afterwards.
    arrow_backed = pd.DataFrame(
        {"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]"
    ).astype("category")
    default_backed = pd.DataFrame(
        {"x": ["foo", "bar", "foo"]}
    ).astype("category")

    # categories should be converted to pa.Array
    assert pa.array(arrow_backed["x"]) == pa.array(default_backed["x"])
    arrow_categories = arrow_backed["x"].cat.categories.values
    default_categories = default_backed["x"].cat.categories.values
    assert pa.array(arrow_categories) == pa.array(default_categories)

    # Round-trip through a Parquet file and compare with the
    # default-dtype frame.
    path = str(tempdir / 'cat.parquet')
    pq.write_table(pa.table(arrow_backed), path)
    roundtripped = pq.read_table(path).to_pandas()

    tm.assert_frame_equal(roundtripped, default_backed)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
Expand Down
17 changes: 16 additions & 1 deletion python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3283,6 +3283,7 @@ def __arrow_array__(self, type=None):
pa.array(arr)

# ARROW-7066 - allow ChunkedArray output
# GH-33727 - if num_chunks=1 return Array
class MyArray2:
def __init__(self, data):
self.data = data
Expand All @@ -3292,7 +3293,21 @@ def __arrow_array__(self, type=None):

arr = MyArray2(np.array([1, 2, 3], dtype='int64'))
result = pa.array(arr)
expected = pa.chunked_array([[1, 2, 3]], type=pa.int64())
expected = pa.array([1, 2, 3], type=pa.int64())
assert result.equals(expected)

class MyArray3:
def __init__(self, data1, data2):
self.data1 = data1
self.data2 = data2

def __arrow_array__(self, type=None):
return pa.chunked_array([self.data1, self.data2], type=type)

np_arr = np.array([1, 2, 3], dtype='int64')
arr = MyArray3(np_arr, np_arr)
result = pa.array(arr)
expected = pa.chunked_array([[1, 2, 3], [1, 2, 3]], type=pa.int64())
assert result.equals(expected)


Expand Down
16 changes: 16 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3924,6 +3924,22 @@ def test_dictionary_from_pandas_specified_type():
assert result.to_pylist() == ['a', 'b']


def test_convert_categories_to_array_with_string_pyarrow_dtype():
    """Regression test for gh-33727.

    Categories stored with the ``string[pyarrow]`` dtype must convert to
    a ``pa.Array`` so that ``pa.DictionaryArray.from_arrays`` can use
    them as the dictionary.
    """
    # gh-33727: categories should be converted to pa.Array
    if Version(pd.__version__) < Version("1.3.0"):
        pytest.skip(
            "PyArrow backed string data type introduced in pandas 1.3.0")

    frame = pd.DataFrame(
        {"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]"
    ).astype("category")

    codes = pa.array(frame['x'].cat.codes)
    categories = pa.array(frame["x"].cat.categories.values)
    assert isinstance(categories, pa.Array)

    # Building the dictionary array by hand must match the direct
    # pandas -> Arrow conversion of the categorical column.
    from_pandas = pa.Array.from_pandas(frame['x'])
    from_parts = pa.DictionaryArray.from_arrays(codes, categories)
    assert from_parts == from_pandas


# ----------------------------------------------------------------------
# Array protocol in pandas conversions tests

Expand Down

0 comments on commit 99a00ef

Please sign in to comment.