From 752e368172bdc642a732e8dff33f69f9a0ec8bcc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:47:19 +0100 Subject: [PATCH 01/16] Switch arrow type for string array to large string --- pandas/core/arrays/arrow/array.py | 6 +++- pandas/core/arrays/string_arrow.py | 32 ++++++++++++++++++++-- pandas/tests/arrays/string_/test_string.py | 28 +++++++++++++++---- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d162b66e5d369..1ae7a485e8f93 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -667,7 +667,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type other = self._box_pa(other) - if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ + if ( + pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_binary(pa_type) + ) and op in [ operator.add, roperator.radd, ]: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 96ebb4901f797..0600b5c06564a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -129,6 +129,9 @@ def __init__(self, values) -> None: super().__init__(values) self._dtype = StringDtype(storage=self._storage) + self._check_string_type() + + def _check_string_type(self): if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) and pa.types.is_string(self._pa_array.type.value_type) @@ -577,12 +580,37 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( values.type ): - values = pc.cast(values, pa.string()) + values = pc.cast(values, 
pa.large_string()) super().__init__(values) + def _check_string_type(self): + if not pa.types.is_large_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_large_string(self._pa_array.type.value_type) + ): + raise ValueError( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + @classmethod def _result_converter(cls, values, na=None): if not isna(na): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 524a6632e5544..12ce6a1149d2d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -478,25 +478,35 @@ def test_fillna_args(dtype, request, arrow_string_storage): def test_arrow_array(dtype): # protocol added in 0.15.0 pa = pytest.importorskip("pyarrow") + import pyarrow.compute as pc data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - + if dtype.storage == "pyarrow_numpy": + expected = pc.cast(arr, pa.large_string()) assert arr.equals(expected) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2): +def 
test_arrow_roundtrip(dtype, string_storage2, request): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") + if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow": + request.applymarker( + pytest.mark.xfail(reason="can't store large string in pyarrow string array") + ) + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "pyarrow_numpy": + assert table.field("a").type == "large_string" + else: + assert table.field("a").type == "string" with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -507,14 +517,22 @@ def test_arrow_roundtrip(dtype, string_storage2): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks(dtype, string_storage2): +def test_arrow_load_from_zero_chunks(dtype, string_storage2, request): # GH-41040 pa = pytest.importorskip("pyarrow") + if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow": + request.applymarker( + pytest.mark.xfail(reason="can't store large string in pyarrow string array") + ) + data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "pyarrow_numpy": + assert table.field("a").type == "large_string" + else: + assert table.field("a").type == "string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage2): From d813e8d7219278521cab46db7e9bf7c1c7fd795c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 28 Nov 2023 20:51:13 +0100 Subject: [PATCH 02/16] Fix concat issue --- pandas/core/arrays/arrow/array.py | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1ae7a485e8f93..3b25f9cc3dc25 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1421,7 +1421,10 @@ def _concat_same_type(cls, to_concat) -> Self: chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attribute pyarrow_dtype - pa_dtype = pa.string() + if to_concat[0].dtype.storage == "pyarrow_numpy": + pa_dtype = pa.large_string() + else: + pa_dtype = pa.string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype arr = pa.chunked_array(chunks, type=pa_dtype) From ed07537d9f850661522b1db7fd75844ae52d8218 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:38:07 +0100 Subject: [PATCH 03/16] Update --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_arrow.py | 62 ++++++++-------------- pandas/tests/arrays/string_/test_string.py | 28 ++++------ 4 files changed, 35 insertions(+), 62 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index dce776755ad7e..1f5df425a4390 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -196,6 +196,8 @@ Other enhancements - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- The dtype ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) + .. --------------------------------------------------------------------------- .. 
_whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3b25f9cc3dc25..628ea2274e70d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1421,10 +1421,7 @@ def _concat_same_type(cls, to_concat) -> Self: chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attribute pyarrow_dtype - if to_concat[0].dtype.storage == "pyarrow_numpy": - pa_dtype = pa.large_string() - else: - pa_dtype = pa.string() + pa_dtype = pa.large_string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype arr = pa.chunked_array(chunks, type=pa_dtype) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0600b5c06564a..9de0c3bca6e25 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -126,20 +126,38 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr _storage = "pyarrow" def __init__(self, values) -> None: + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + super().__init__(values) self._dtype = StringDtype(storage=self._storage) - self._check_string_type() - - def _check_string_type(self): - if not pa.types.is_string(self._pa_array.type) and not ( + if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_string(self._pa_array.type.value_type) + and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) 
+ return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + def __len__(self) -> int: """ Length of this array. @@ -577,40 +595,6 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" - def __init__(self, values) -> None: - _chk_pyarrow_available() - - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type - ): - values = pc.cast(values, pa.large_string()) - super().__init__(values) - - def _check_string_type(self): - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) - - @classmethod - def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: - pa_scalar = super()._box_pa_scalar(value, pa_type) - if pa.types.is_string(pa_scalar.type) and pa_type is None: - pa_scalar = pc.cast(pa_scalar, pa.large_string()) - return pa_scalar - - @classmethod - def _box_pa_array( - cls, value, pa_type: pa.DataType | None = None, copy: bool = False - ) -> pa.Array | pa.ChunkedArray: - pa_array = super()._box_pa_array(value, pa_type) - if pa.types.is_string(pa_array.type) and pa_type is None: - pa_array = pc.cast(pa_array, pa.large_string()) - return pa_array - @classmethod def _result_converter(cls, values, na=None): if not isna(na): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 12ce6a1149d2d..0eaf461a7ebbb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -482,11 +482,11 @@ 
def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) - expected = pa.array(list(data), type=pa.string(), from_pandas=True) + expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage == "pyarrow_numpy": - expected = pc.cast(arr, pa.large_string()) + if dtype.storage == "python": + expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -495,18 +495,13 @@ def test_arrow_roundtrip(dtype, string_storage2, request): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow": - request.applymarker( - pytest.mark.xfail(reason="can't store large string in pyarrow string array") - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "pyarrow_numpy": - assert table.field("a").type == "large_string" - else: + if dtype.storage == "python": assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -521,18 +516,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2, request): # GH-41040 pa = pytest.importorskip("pyarrow") - if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow": - request.applymarker( - pytest.mark.xfail(reason="can't store large string in pyarrow string array") - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "pyarrow_numpy": - assert table.field("a").type == "large_string" - else: + if dtype.storage == "python": assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" # Instantiate the same 
table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage2): From c0c42a8c805fe585a81b1ef8e4ccbedea30b766c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:38:16 +0100 Subject: [PATCH 04/16] Update --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 1f5df425a4390..2dec368712f71 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -196,7 +196,7 @@ Other enhancements - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) -- The dtype ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) +- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) .. 
--------------------------------------------------------------------------- From 3196f32ad80dc10b3555972d9f7ef61c6b11051b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:55:34 +0100 Subject: [PATCH 05/16] Update tests --- pandas/tests/arrays/string_/test_string_arrow.py | 5 ++++- pandas/tests/io/json/test_pandas.py | 7 +++++++ pandas/tests/io/parser/test_read_fwf.py | 6 ++++++ pandas/tests/io/test_html.py | 5 +++++ pandas/tests/io/test_sql.py | 7 +++++++ pandas/tests/io/xml/test_xml.py | 7 +++++++ 6 files changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a801a845bc7be..14f2220a8034e 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -81,11 +81,14 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) +@pytest.mark.xfail( + "dictionary conversion does not seem to be implemented for large string in arrow" +) @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_valid_string_type_value_dictionary(chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.large_string())) if chunked: arr = pa.chunked_array(arr) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 411cc90ba41a7..505a743c77afc 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2010,6 +2010,13 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from 
pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 34cae289c0f22..62af4588ad4cd 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -965,6 +965,12 @@ def test_dtype_backend(string_storage, dtype_backend): if string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + arr = ArrowExtensionArray(pa.array(["a", "b"])) + arr_na = ArrowExtensionArray(pa.array([None, "a"])) else: pa = pytest.importorskip("pyarrow") arr = ArrowStringArray(pa.array(["a", "b"])) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index dcee52011a691..edf2b05f1c83f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -186,7 +186,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): if string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d7c69ff17749c..b5588898d696b 
100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e4456b0a78e06..6f429c1ecbf8a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["x", "y"])) From e807652d9af775a0e88fd165aa5e2acb5b6ca6d3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:36:37 +0100 Subject: [PATCH 06/16] Fix test --- pandas/io/sql.py | 12 +++++++++--- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d4b6602d9f0eb..ff6d53bf61001 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -172,9 +172,15 @@ def _convert_arrays_to_dataframe( ) if dtype_backend == "pyarrow": pa = 
import_optional_dependency("pyarrow") - arrays = [ - ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays - ] + import pyarrow.compute as pc + + result_arrays = [] + for arr in arrays: + pa_array = pa.array(arr, from_pandas=True) + if arr.dtype == "string": + pa_array = pc.cast(pa_array, pa.string()) + result_arrays.append(ArrowExtensionArray(pa_array)) + arrays = result_arrays if arrays: df = DataFrame(dict(zip(list(range(len(columns))), arrays))) df.columns = columns diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 14f2220a8034e..aedc2196d88b6 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -82,7 +82,7 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): @pytest.mark.xfail( - "dictionary conversion does not seem to be implemented for large string in arrow" + reason="dict conversion does not seem to be implemented for large string in arrow" ) @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_valid_string_type_value_dictionary(chunked): From 6dc3f20afe7d925d386a6db7e97611c29d2d3a8c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 23:07:23 +0100 Subject: [PATCH 07/16] Fixup --- pandas/core/arrays/string_arrow.py | 1 + pandas/tests/io/test_clipboard.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index db30ee417f228..3d73605b67ec2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -127,6 +127,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr _storage = "pyarrow" def __init__(self, values) -> None: + _chk_pyarrow_available() if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( values.type ): diff --git a/pandas/tests/io/test_clipboard.py 
b/pandas/tests/io/test_clipboard.py index b0bf046609162..d418612886a6c 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -363,8 +363,8 @@ def test_read_clipboard_dtype_backend( pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) else: string_array = ArrowStringArray(pa.array(["x", "y"])) From 848f7ed157eeaf615ed8d58972236f529767474b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 10 Dec 2023 00:12:26 +0100 Subject: [PATCH 08/16] Fixup --- pandas/tests/io/test_clipboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index d418612886a6c..3c0208fcc74ec 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -359,7 +359,7 @@ def test_read_clipboard_dtype_backend( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": + elif dtype_backend == "pyarrow" and engine != "c": pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray From 72448898a31cd660f2aaae0b9568281129ede088 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 10 Dec 2023 00:13:06 +0100 Subject: [PATCH 09/16] Fixup --- pandas/tests/io/test_sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9955ec8b10365..6645aefd4f0a7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3651,8 +3651,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: pa = pytest.importorskip("pyarrow") from pandas.arrays 
import ArrowExtensionArray - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] else: pa = pytest.importorskip("pyarrow") From 3d90cc71de6c4202f327862f9040a4fe1e5a43b9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 14 Dec 2023 22:07:34 +0100 Subject: [PATCH 10/16] Update pandas/tests/arrays/string_/test_string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index f2862e0f440f1..a5907335f3e5e 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -90,7 +90,7 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): def test_constructor_valid_string_type_value_dictionary(chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.large_string())) + arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) From 46d7f16ddbbc170c4f85071d7ee0f927a07b5319 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 14 Dec 2023 22:08:32 +0100 Subject: [PATCH 11/16] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2ef5884ebab5e..eb8b12c8a6e56 100644 --- a/pandas/core/arrays/string_arrow.py +++ 
b/pandas/core/arrays/string_arrow.py @@ -142,7 +142,7 @@ def __init__(self, values) -> None: and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) @classmethod From 3fdf25687701c261b087e9288da0459c8e5e1c7a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 14 Dec 2023 22:18:12 +0100 Subject: [PATCH 12/16] Update string_arrow.py --- pandas/core/arrays/string_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index eb8b12c8a6e56..17c185ed820b9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -142,7 +142,8 @@ def __init__(self, values) -> None: and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of large_string type" + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" ) @classmethod From c2bd9d2e6fc0251477da7db9ff834ed6d05a2fb4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 14 Dec 2023 23:13:00 +0100 Subject: [PATCH 13/16] FIxup --- pandas/tests/arrays/string_/test_string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a5907335f3e5e..99ec327c4c142 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -62,7 +62,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow 
(chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) @@ -77,7 +77,7 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): arr = pa.chunked_array(arr) msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) From a22e625148292fec19223165d6624d62198b7bdd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 19:29:06 +0100 Subject: [PATCH 14/16] Update --- pandas/io/sql.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2221ed6ff72fd..a51b8d562d7bf 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -172,13 +172,12 @@ def _convert_arrays_to_dataframe( ) if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - import pyarrow.compute as pc result_arrays = [] for arr in arrays: pa_array = pa.array(arr, from_pandas=True) if arr.dtype == "string": - pa_array = pc.cast(pa_array, pa.string()) + pa_array = pa_array.cast(pa.string()) result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: From 847b74c123145c14781058e7892e417d874d2e7c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 19:44:00 +0100 Subject: [PATCH 15/16] Add todo --- pandas/io/sql.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index a51b8d562d7bf..b0fa6bc6e90c4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -177,6 +177,9 @@ def _convert_arrays_to_dataframe( for arr in arrays: pa_array = pa.array(arr, from_pandas=True) if arr.dtype == "string": + # TODO: Arrow still infers strings arrays as regular strings instead + # of large_string, which is what we 
preserve everywhere else for + # dtype_backend="pyarrow". We may want to reconsider this pa_array = pa_array.cast(pa.string()) result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] From 47fda87a68436616a8727f787d4d760762be96f0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 21:08:29 +0100 Subject: [PATCH 16/16] Fixup --- pandas/core/arrays/arrow/array.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 50c5b722869bc..d7bec102c43ca 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -291,6 +291,7 @@ def _from_sequence_of_strings( pa_type is None or pa.types.is_binary(pa_type) or pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) ): # pa_type is None: Let pa.array infer # pa_type is string/binary: scalars already correct type @@ -632,7 +633,9 @@ def __invert__(self) -> Self: # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): return type(self)(pc.bit_wise_not(self._pa_array)) - elif pa.types.is_string(self._pa_array.type): + elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): # Raise TypeError instead of pa.ArrowNotImplementedError raise TypeError("__invert__ is not supported for string dtypes") else: @@ -713,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs): result = pc.binary_repeat(binary, pa_integral) return type(self)(result) elif ( - pa.types.is_string(other.type) or pa.types.is_binary(other.type) + pa.types.is_string(other.type) + or pa.types.is_binary(other.type) + or pa.types.is_large_string(other.type) ) and op in [operator.mul, roperator.rmul]: binary = other integral = self._pa_array @@ -2257,7 +2262,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return 
type(self)(result) def _str_join(self, sep: str): - if pa.types.is_string(self._pa_array.type): + if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): result = self._apply_elementwise(list) result = pa.chunked_array(result, type=pa.list_(pa.string())) else: