diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 70039cc697b8a..e0aa4d1306c95 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -236,6 +236,8 @@ Other enhancements - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) + .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 633efe43fce1a..d7bec102c43ca 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -291,6 +291,7 @@ def _from_sequence_of_strings( pa_type is None or pa.types.is_binary(pa_type) or pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) ): # pa_type is None: Let pa.array infer # pa_type is string/binary: scalars already correct type @@ -632,7 +633,9 @@ def __invert__(self) -> Self: # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): return type(self)(pc.bit_wise_not(self._pa_array)) - elif pa.types.is_string(self._pa_array.type): + elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): # Raise TypeError instead of pa.ArrowNotImplementedError raise TypeError("__invert__ is not supported for string dtypes") else: @@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type other = self._box_pa(other) - if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): + if ( + pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_binary(pa_type) + ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) if op is operator.add: @@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs): result = pc.binary_repeat(binary, pa_integral) return type(self)(result) elif ( - pa.types.is_string(other.type) or pa.types.is_binary(other.type) + pa.types.is_string(other.type) + or pa.types.is_binary(other.type) + or pa.types.is_large_string(other.type) ) and op in [operator.mul, roperator.rmul]: binary = other integral = self._pa_array @@ -1471,7 +1480,7 @@ def _concat_same_type(cls, to_concat) -> Self: chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attribute pyarrow_dtype - pa_dtype = pa.string() + pa_dtype = pa.large_string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype arr = pa.chunked_array(chunks, type=pa_dtype) @@ -2253,7 +2262,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return type(self)(result) def _str_join(self, sep: str): - if pa.types.is_string(self._pa_array.type): + if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): result = self._apply_elementwise(list) result = pa.chunked_array(result, type=pa.list_(pa.string())) else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 32ab3054c0f51..56732619a2d29 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr _storage = "pyarrow" def __init__(self, values) -> None: + _chk_pyarrow_available() + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + super().__init__(values) self._dtype = StringDtype(storage=self._storage) - if not pa.types.is_string(self._pa_array.type) and not ( + if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_string(self._pa_array.type.value_type) + and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" ) + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + def __len__(self) -> int: """ Length of this array. @@ -574,15 +597,6 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" - def __init__(self, values) -> None: - _chk_pyarrow_available() - - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( - values.type - ): - values = pc.cast(values, pa.string()) - super().__init__(values) - @classmethod def _result_converter(cls, values, na=None): if not isna(na): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 12118d1488932..b0fa6bc6e90c4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe( ) if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - arrays = [ - ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays - ] + + result_arrays = [] + for arr in arrays: + pa_array = pa.array(arr, from_pandas=True) + if arr.dtype == "string": + # TODO: Arrow still infers strings arrays as regular strings instead + # of large_string, which is what we preserver everywhere else for + # dtype_backend="pyarrow". We may want to reconsider this + pa_array = pa_array.cast(pa.string()) + result_arrays.append(ArrowExtensionArray(pa_array)) + arrays = result_arrays # type: ignore[assignment] if arrays: df = DataFrame(dict(zip(list(range(len(columns))), arrays))) df.columns = columns diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 41255b2516e7e..320bdca60a932 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage): def test_arrow_array(dtype): # protocol added in 0.15.0 pa = pytest.importorskip("pyarrow") + import pyarrow.compute as pc data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) - expected = pa.array(list(data), type=pa.string(), from_pandas=True) + expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - + if dtype.storage == "python": + expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage2): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 222b77cb4e94f..d7811b6fed883 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) @@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): arr = pa.chunked_array(arr) msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) +@pytest.mark.xfail( + reason="dict conversion does not seem to be implemented for large string in arrow" +) @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_valid_string_type_value_dictionary(chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 58a1e51d31b74..0eefb0b52c483 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend( string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 8566b87ef4292..bed2b5e10a6f7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend): if string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + arr = ArrowExtensionArray(pa.array(["a", "b"])) + arr_na = ArrowExtensionArray(pa.array([None, "a"])) else: pa = pytest.importorskip("pyarrow") arr = ArrowStringArray(pa.array(["a", "b"])) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 8564f09ef7ae9..3c0208fcc74ec 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow" and engine != "c": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: string_array = ArrowStringArray(pa.array(["x", "y"])) string_array_na = ArrowStringArray(pa.array(["x", None])) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 15c5953e79bda..22a7d3b83a459 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f0256316e1689..607357e709b6e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): if string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e3272e5f5902d..6645aefd4f0a7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e4456b0a78e06..6f429c1ecbf8a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["x", "y"]))