Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch arrow type for string array to large string #56220

Merged
merged 19 commits
Dec 21, 2023
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ Other enhancements
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)


.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,7 +692,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
pa_type = self._pa_array.type
other = self._box_pa(other)

if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
if op is operator.add:
Expand Down Expand Up @@ -1471,7 +1475,7 @@ def _concat_same_type(cls, to_concat) -> Self:
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.string()
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype
arr = pa.chunked_array(chunks, type=pa_dtype)
Expand Down
38 changes: 26 additions & 12 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
_storage = "pyarrow"

def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
values.type
):
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)

if not pa.types.is_string(self._pa_array.type) and not (
if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
and pa.types.is_string(self._pa_array.type.value_type)
and pa.types.is_large_string(self._pa_array.type.value_type)
):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
)

@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar

@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array

def __len__(self) -> int:
"""
Length of this array.
Expand Down Expand Up @@ -574,15 +597,6 @@ def _rank(
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"

def __init__(self, values) -> None:
_chk_pyarrow_available()

if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
values.type
):
values = pc.cast(values, pa.string())
super().__init__(values)

@classmethod
def _result_converter(cls, values, na=None):
if not isna(na):
Expand Down
11 changes: 8 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,14 @@ def _convert_arrays_to_dataframe(
)
if dtype_backend == "pyarrow":
pa = import_optional_dependency("pyarrow")
arrays = [
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
]

result_arrays = []
for arr in arrays:
pa_array = pa.array(arr, from_pandas=True)
if arr.dtype == "string":
pa_array = pa_array.cast(pa.string())
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
df.columns = columns
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage):
def test_arrow_array(dtype):
# protocol added in 0.15.0
pa = pytest.importorskip("pyarrow")
import pyarrow.compute as pc

data = pd.array(["a", "b", "c"], dtype=dtype)
arr = pa.array(data)
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
expected = pa.chunked_array(expected)

if dtype.storage == "python":
expected = pc.cast(expected, pa.string())
assert arr.equals(expected)


Expand All @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
Expand All @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
else:
msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)
Expand All @@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
arr = pa.chunked_array(arr)

msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)


@pytest.mark.xfail(
reason="dict conversion does not seem to be implemented for large string in arrow"
)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(chunked):
pa = pytest.importorskip("pyarrow")

arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend):
if string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b"]))
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
else:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArray(pa.array(["a", "b"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow" and engine != "c":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
string_array = ArrowStringArray(pa.array(["x", "y"]))
string_array_na = ArrowStringArray(pa.array(["x", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["x", "y"]))
Expand Down
Loading