GH-33727: [Python] array() errors if pandas categorical column has di…

…ctionary as string not object (#34289) ### Rationale for this change Currently writing a pandas dataframe with categorical column of dtype `string[pyarrow]` fails. The reason for this is that when category with `string[pyarrow]` dtype is converted to an array in pyarrow it results in a `ChunkedArray,` not `Array`, and then `DictionaryArray.from_arrays` fails. ### What changes are included in this PR? `_handle_arrow_array_protocol` method in _array.pxi_ is updated so that in case of a `ChunkedArray` with one chunk, the result is a `pyarrow.Array` and not `pa.ChunkedArray.` ### Are these changes tested? Yes. Tests are added to: - python/pyarrow/tests/parquet/test_pandas.py - python/pyarrow/tests/test_pandas.py - python/pyarrow/tests/test_array.py ### Are there any user-facing changes? No. * Closes: #33727 Lead-authored-by: Alenka Frim <[email protected]> Co-authored-by: Alenka Frim <[email protected]> Co-authored-by: Joris Van den Bossche <[email protected]> Signed-off-by: Joris Van den Bossche <[email protected]>
apache · Mar 28, 2023 · ca18e6f · ca18e6f
1 parent 2dbd39c
commit ca18e6f
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 1 deletion.
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -111,6 +111,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
     if not isinstance(res, (Array, ChunkedArray)):
         raise TypeError("The object's __arrow_array__ method does not "
                         "return a pyarrow Array or ChunkedArray.")
+    if isinstance(res, ChunkedArray) and res.num_chunks==1:
+        res = res.chunk(0)
     return res
 
 

diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
@@ -26,6 +26,7 @@
 from pyarrow.tests.parquet.common import (
     parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
 from pyarrow.util import guid
+from pyarrow.vendored.version import Version
 
 try:
     import pyarrow.parquet as pq
@@ -556,6 +557,30 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset):
     tm.assert_frame_equal(result, df)
 
 
+@pytest.mark.pandas
+def test_categories_with_string_pyarrow_dtype(tempdir):
+    # gh-33727: writing to parquet should not fail
+    if Version(pd.__version__) < Version("1.3.0"):
+        pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0")
+
+    df1 = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]")
+    df1 = df1.astype("category")
+
+    df2 = pd.DataFrame({"x": ["foo", "bar", "foo"]})
+    df2 = df2.astype("category")
+
+    # categories should be converted to pa.Array
+    assert pa.array(df1["x"]) == pa.array(df2["x"])
+    assert pa.array(df1["x"].cat.categories.values) == pa.array(
+        df2["x"].cat.categories.values)
+
+    path = str(tempdir / 'cat.parquet')
+    pq.write_table(pa.table(df1), path)
+    result = pq.read_table(path).to_pandas()
+
+    tm.assert_frame_equal(result, df2)
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_write_to_dataset_pandas_preserve_extensiondtypes(

diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
@@ -3283,6 +3283,7 @@ def __arrow_array__(self, type=None):
         pa.array(arr)
 
     # ARROW-7066 - allow ChunkedArray output
+    # GH-33727 - if num_chunks=1 return Array
     class MyArray2:
         def __init__(self, data):
             self.data = data
@@ -3292,7 +3293,21 @@ def __arrow_array__(self, type=None):
 
     arr = MyArray2(np.array([1, 2, 3], dtype='int64'))
     result = pa.array(arr)
-    expected = pa.chunked_array([[1, 2, 3]], type=pa.int64())
+    expected = pa.array([1, 2, 3], type=pa.int64())
+    assert result.equals(expected)
+
+    class MyArray3:
+        def __init__(self, data1, data2):
+            self.data1 = data1
+            self.data2 = data2
+
+        def __arrow_array__(self, type=None):
+            return pa.chunked_array([self.data1, self.data2], type=type)
+
+    np_arr = np.array([1, 2, 3], dtype='int64')
+    arr = MyArray3(np_arr, np_arr)
+    result = pa.array(arr)
+    expected = pa.chunked_array([[1, 2, 3], [1, 2, 3]], type=pa.int64())
     assert result.equals(expected)
 
 

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -3924,6 +3924,22 @@ def test_dictionary_from_pandas_specified_type():
     assert result.to_pylist() == ['a', 'b']
 
 
+def test_convert_categories_to_array_with_string_pyarrow_dtype():
+    # gh-33727: categories should be converted to pa.Array
+    if Version(pd.__version__) < Version("1.3.0"):
+        pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0")
+
+    df = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]")
+    df = df.astype("category")
+    indices = pa.array(df['x'].cat.codes)
+    dictionary = pa.array(df["x"].cat.categories.values)
+    assert isinstance(dictionary, pa.Array)
+
+    expected = pa.Array.from_pandas(df['x'])
+    result = pa.DictionaryArray.from_arrays(indices, dictionary)
+    assert result == expected
+
+
 # ----------------------------------------------------------------------
 # Array protocol in pandas conversions tests