Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch arrow type for string array to large string #56220

Merged
merged 19 commits into from
Dec 21, 2023
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ Other enhancements
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)


.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
pa_type = self._pa_array.type
other = self._box_pa(other)

if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd, operator.mul, roperator.rmul]:
sep = pa.scalar("", type=pa_type)
if op is operator.add:
Expand Down Expand Up @@ -1424,7 +1428,7 @@ def _concat_same_type(cls, to_concat) -> Self:
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.string()
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype
arr = pa.chunked_array(chunks, type=pa_dtype)
Expand Down
35 changes: 24 additions & 11 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,39 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
_storage = "pyarrow"

def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
values.type
):
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)

if not pa.types.is_string(self._pa_array.type) and not (
if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
and pa.types.is_string(self._pa_array.type.value_type)
and pa.types.is_large_string(self._pa_array.type.value_type)
):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
phofl marked this conversation as resolved.
Show resolved Hide resolved
)

@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar

@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array

def __len__(self) -> int:
"""
Length of this array.
Expand Down Expand Up @@ -576,15 +598,6 @@ def _rank(
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"

def __init__(self, values) -> None:
_chk_pyarrow_available()

if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
values.type
):
values = pc.cast(values, pa.string())
super().__init__(values)

@classmethod
def _result_converter(cls, values, na=None):
if not isna(na):
Expand Down
12 changes: 9 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,15 @@ def _convert_arrays_to_dataframe(
)
if dtype_backend == "pyarrow":
pa = import_optional_dependency("pyarrow")
arrays = [
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
]
import pyarrow.compute as pc

result_arrays = []
for arr in arrays:
pa_array = pa.array(arr, from_pandas=True)
if arr.dtype == "string":
pa_array = pc.cast(pa_array, pa.string())
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
result_arrays.append(ArrowExtensionArray(pa_array))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason for this cast? (and maybe add a comment about it)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arrow is inferring this as regular strings; I think we had failing tests without this cast.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'm still confused about this as well. If arr.dtype == "string", we are still casting to pa.string()? What would the result type of pa.array(arr, from_pandas=True) be?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, the comment above was incorrect; it's like this:

We are now using large_string in our string extension arrays; e.g. if you convert one of them to an ArrowExtensionArray, it will also be large_string. This is inconsistent with the other I/O methods, where ArrowExtensionArray is still pa.string; that's why I am casting it back here.

I am happy to change this as well, but would rather do it in a follow-up.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah okay, that makes sense. I'm OK with this then, but it would be good to have a # TODO noting we may want to keep large_string here in the future.

arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
df.columns = columns
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,15 @@ def test_fillna_args(dtype, request, arrow_string_storage):
def test_arrow_array(dtype):
# protocol added in 0.15.0
pa = pytest.importorskip("pyarrow")
import pyarrow.compute as pc

data = pd.array(["a", "b", "c"], dtype=dtype)
arr = pa.array(data)
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
expected = pa.chunked_array(expected)

if dtype.storage == "python":
expected = pc.cast(expected, pa.string())
assert arr.equals(expected)


Expand All @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
Expand All @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,14 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
ArrowStringArray(arr)


@pytest.mark.xfail(
reason="dict conversion does not seem to be implemented for large string in arrow"
)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(chunked):
pa = pytest.importorskip("pyarrow")

arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.large_string()))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.large_string()))
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()

(it's only the python->arrow converter that doesn't seem to implement this, but creating a dictionary array with large string in pyarrow itself is certainly supported)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, it looks a bit strange that we actually allow creating a string column backed by a dictionary array. It would be nice to support this long-term, but right now many operations will just fail (e.g. all string compute functions from pyarrow will fail on a dictionary[string] type).

I think for fixing #53951, instead of allowing dictionary to pass through, we should rather convert the dictionary to a plain string array?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do this as a follow-up, but I don't think that this is a real use case anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The report in #53951 is a real use case, though (and that will now create such a dictionary-backed string column), AFAIU.

But indeed for a different issue/PR

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this also happening on main? Maybe I am misunderstanding something.

if chunked:
arr = pa.chunked_array(arr)

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,12 @@ def test_dtype_backend(string_storage, dtype_backend):
if string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b"]))
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
else:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArray(pa.array(["a", "b"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow" and engine != "c":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
string_array = ArrowStringArray(pa.array(["x", "y"]))
string_array_na = ArrowStringArray(pa.array(["x", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["x", "y"]))
Expand Down
Loading