From dc7270d91345ff5a5e623ee5ca627fd7b0ec6103 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 26 Oct 2023 00:37:51 +0200
Subject: [PATCH 1/3] REGR: fix read_parquet with column of large strings (avoid overflow from concat)

---
 doc/source/whatsnew/v2.1.2.rst  |  1 +
 pandas/core/arrays/string_.py   | 10 ++++++++--
 pandas/tests/io/test_parquet.py | 11 +++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 8863bfa9f3f69..a7ef73ff1111d 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`)
 - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
 - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`)
+- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.bug_fixes:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index a50c95d33d444..410802007beb4 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -228,11 +228,17 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
+            results = []
+            for arr in chunks:
+                arr = arr.to_numpy(zero_copy_only=False)
+                arr = ensure_string_array(arr, na_value=libmissing.NA)
+                results.append(arr)
+
             if len(chunks) == 0:
                 arr = np.array([], dtype=object)
             else:
-                arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
-                arr = ensure_string_array(arr, na_value=libmissing.NA)
+                arr = np.concatenate(results)
+
             # Bypass validation inside StringArray constructor, see GH#47781
             new_string_array = StringArray.__new__(StringArray)
             NDArrayBacked.__init__(
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 581be53145228..8da791f054887 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1141,6 +1141,17 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_string_column_above_2GB(self, tmp_path, pa):
+        # https://github.com/pandas-dev/pandas/issues/55606
+        # above 2GB of string data
+        v1 = b"x" * 100000000
+        v2 = b"x" * 147483646
+        df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
+        df.to_parquet(tmp_path / "test.parquet")
+        result = read_parquet(tmp_path / "test.parquet")
+        assert result["strings"].dtype == "string"
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
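For context on the regression the first patch fixes: pyarrow's (non-large) string type addresses its character buffer with 32-bit offsets, so a single Array tops out just under 2 GiB of string data, and `pyarrow.concat_arrays` fails with an offset overflow beyond that. Below is a minimal sketch (not part of the patch) contrasting the old and new code paths in `__from_arrow__`; the chunk sizes are illustrative, and running it needs several GB of free memory.

```python
# Illustrative sketch only; assumes pyarrow and numpy are installed and
# that several GB of RAM are available. The chunk sizes are arbitrary.
import numpy as np
import pyarrow as pa

# Two ~1.1 GB chunks; combined they exceed the 2**31 - 1 bytes that
# 32-bit string offsets can address.
chunk = pa.array(["x" * 100_000_000] * 11)
chunked = pa.chunked_array([chunk, chunk])

# Old code path: concatenate the pyarrow chunks first, then convert.
try:
    pa.concat_arrays(chunked.chunks).to_numpy(zero_copy_only=False)
except pa.ArrowInvalid as exc:
    print(exc)  # expected: an "offset overflow" error, as in GH#55606

# New code path: convert chunk by chunk, then concatenate in numpy,
# whose object arrays have no comparable offset limit.
results = [c.to_numpy(zero_copy_only=False) for c in chunked.chunks]
arr = np.concatenate(results)
print(len(arr))  # 22
```

The key observation is that each individual chunk stays under the offset limit; only the pyarrow-level concatenation overflowed, so moving the concatenation to numpy object arrays fixes the read without extra copies.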
From 6a2e37272d623c2e92625ba6caa2b198c051f211 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 26 Oct 2023 12:17:52 +0200
Subject: [PATCH 2/3] comment out test

---
 pandas/tests/io/test_parquet.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 8da791f054887..0a72fd7bbec7d 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1141,16 +1141,17 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
         )
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.slow
-    def test_string_column_above_2GB(self, tmp_path, pa):
-        # https://github.com/pandas-dev/pandas/issues/55606
-        # above 2GB of string data
-        v1 = b"x" * 100000000
-        v2 = b"x" * 147483646
-        df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
-        df.to_parquet(tmp_path / "test.parquet")
-        result = read_parquet(tmp_path / "test.parquet")
-        assert result["strings"].dtype == "string"
+    # NOTE: this test is not run by default, because it requires a lot of memory (>5GB)
+    # @pytest.mark.slow
+    # def test_string_column_above_2GB(self, tmp_path, pa):
+    #     # https://github.com/pandas-dev/pandas/issues/55606
+    #     # above 2GB of string data
+    #     v1 = b"x" * 100000000
+    #     v2 = b"x" * 147483646
+    #     df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
+    #     df.to_parquet(tmp_path / "test.parquet")
+    #     result = read_parquet(tmp_path / "test.parquet")
+    #     assert result["strings"].dtype == "string"
 
 
 class TestParquetFastParquet(Base):

From ea3f32b6e990e9a1e76c6a0c6c1e42c0d311e8d8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 26 Oct 2023 12:19:05 +0200
Subject: [PATCH 3/3] add comment

---
 pandas/core/arrays/string_.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 410802007beb4..471b37eac783b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -230,6 +230,8 @@ def __from_arrow__(
 
             results = []
             for arr in chunks:
+                # convert chunk by chunk to numpy and then concatenate, to avoid
+                # overflow for large string data when concatenating the pyarrow arrays
                 arr = arr.to_numpy(zero_copy_only=False)
                 arr = ensure_string_array(arr, na_value=libmissing.NA)
                 results.append(arr)
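Since the regression test ends up disabled (patch 2) because CI workers lack the >5 GB of memory it needs, the scenario can still be exercised by hand. Below is a standalone sketch of such a manual run, mirroring the commented-out test, with a plain file path substituted for the tmp_path fixture.

```python
# Manual run of the disabled test's scenario; needs >5 GB of RAM plus
# some disk space. The file name is arbitrary.
import pandas as pd

v1 = b"x" * 100000000  # twenty 100 MB values, plus ...
v2 = b"x" * 147483646  # ... this one, totals 2**31 - 2 bytes; the twenty
# single-byte values below then push the column past 2**31 - 1 bytes
df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
df.to_parquet("test.parquet")

# Before the fix, reading this back hit pyarrow's offset-overflow error;
# with the fix it round-trips and keeps the "string" dtype.
result = pd.read_parquet("test.parquet")
assert result["strings"].dtype == "string"
```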