From 5af2972665a8cbbd77f914832866eb206151b8c6 Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sat, 7 Dec 2024 22:33:39 +0800 Subject: [PATCH 1/6] BUG: Fix multi-index on columns with bool level values does not roundtrip through parquet --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/astype.py | 7 +++++++ pandas/tests/io/test_parquet.py | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab5746eca1b18..0ace6f681f3a7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -709,6 +709,7 @@ I/O - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) +- Bug in :meth:`read_parquet` raising ``ValueError`` if the multi-index contains a level with bools and if that multi-index is on the columns, then while the parquet can be written with the ``pyarrow`` engine, it cannot be read back in using ``pyarrow``. 
(:issue:`60508`) Period ^^^^^^ diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 086f7d2da6640..9f9298b4d4a79 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -125,6 +125,13 @@ def _astype_nansafe( ) raise ValueError(msg) + if arr.dtype == object and dtype == bool: + # If the dtype is bool and the array is object, we need to replace the False and True of the object type in the ndarray with the bool type + # to ensure that the type conversion is correct + arr[arr == "False"] = np.False_ + arr[arr == "True"] = np.True_ + return arr.astype(dtype, copy=copy) + if copy or arr.dtype == object or dtype == object: # Explicit copy, or required since NumPy can't view from / to object. return arr.astype(dtype, copy=True) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7919bb956dc7a..3354f9353a309 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1468,3 +1468,24 @@ def test_invalid_dtype_backend(self, engine): df.to_parquet(path) with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") + + def test_bool_multiIndex(self, tmp_path, pa): + # GH 60508 + df = pd.DataFrame( + [ + [1, 2], + [4, 5], + ], + columns=pd.MultiIndex.from_tuples( + [ + (True, 'B'), + (False, 'C'), + ] + ) + ) + df.to_parquet( + path=tmp_path, + engine=pa, + ) + result = pd.read_parquet(tmp_path, engine=pa) + tm.assert_frame_equal(result, df) From 4b1b9b8f8998a50beca8c8c42bdb8bf7dae3ce03 Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sat, 7 Dec 2024 23:06:32 +0800 Subject: [PATCH 2/6] BUG: Modify the comment format. 
--- pandas/core/dtypes/astype.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 9f9298b4d4a79..2a1b3a16c9d67 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -126,8 +126,9 @@ def _astype_nansafe( raise ValueError(msg) if arr.dtype == object and dtype == bool: - # If the dtype is bool and the array is object, we need to replace the False and True of the object type in the ndarray with the bool type - # to ensure that the type conversion is correct + # If the dtype is bool and the array is object, we need to replace + # the False and True of the object type in the ndarray with the + # bool type to ensure that the type conversion is correct arr[arr == "False"] = np.False_ arr[arr == "True"] = np.True_ return arr.astype(dtype, copy=copy) From 81596e7790f494e9b23d3ad15ac10f8d73c47449 Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sat, 7 Dec 2024 23:22:09 +0800 Subject: [PATCH 3/6] BUG: Modify the test function. 
--- pandas/tests/io/test_parquet.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3354f9353a309..bbcd733d48bb3 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1472,20 +1472,12 @@ def test_invalid_dtype_backend(self, engine): def test_bool_multiIndex(self, tmp_path, pa): # GH 60508 df = pd.DataFrame( - [ - [1, 2], - [4, 5], - ], - columns=pd.MultiIndex.from_tuples( - [ - (True, 'B'), - (False, 'C'), - ] - ) + [[1, 2], [4, 5]], + columns=pd.MultiIndex.from_tuples([(True, 'B'), (False, 'C')]), ) df.to_parquet( path=tmp_path, engine=pa, ) - result = pd.read_parquet(tmp_path, engine=pa) + result = read_parquet(tmp_path, engine=pa) tm.assert_frame_equal(result, df) From 912dca66424aeab48bf2b247633f586614b0aeb2 Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sat, 7 Dec 2024 23:27:27 +0800 Subject: [PATCH 4/6] BUG: Modify the v3.0.0.rst file. --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0ace6f681f3a7..9427d410e700a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -709,7 +709,7 @@ I/O - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :meth:`read_parquet` raising ``ValueError`` if the multi-index contains a level with bools and if that multi-index is on the columns, then while the parquet can be written with the ``pyarrow`` engine, it cannot be read back in using ``pyarrow``. 
(:issue:`60508`) +- Bug in :meth:`read_parquet` raising ``ValueError`` when reading back, with the ``pyarrow`` engine, a parquet file written with the ``pyarrow`` engine from a :class:`DataFrame` whose column :class:`MultiIndex` contains a level with bool values (:issue:`60508`) Period ^^^^^^ From a22e7b9229bf5f3f45cd050bb6e038d85ebb15df Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sun, 8 Dec 2024 08:57:15 +0800 Subject: [PATCH 5/6] BUG: Modify the test_bool_multiIndex_roundtrip_through_parquet function --- pandas/tests/io/test_parquet.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index bbcd733d48bb3..21852169fea4a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1469,15 +1469,14 @@ def test_invalid_dtype_backend(self, engine): with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") - def test_bool_multiIndex(self, tmp_path, pa): + def test_bool_multiIndex_roundtrip_through_parquet(self, pa): # GH 60508 df = pd.DataFrame( [[1, 2], [4, 5]], columns=pd.MultiIndex.from_tuples([(True, 'B'), (False, 'C')]), ) - df.to_parquet( - path=tmp_path, - engine=pa, - ) - result = read_parquet(tmp_path, engine=pa) + with tm.ensure_clean("test.parquet") as path: + df.to_parquet(f, engine=pa) + + result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) From b9b3455ee75b3d2a4569fdcb7ecb5a6eee0249d1 Mon Sep 17 00:00:00 2001 From: sunlight798 <3281498087@qq.com> Date: Sun, 8 Dec 2024 09:14:16 +0800 Subject: [PATCH 6/6] BUG: Modify the test_bool_multiIndex_roundtrip_through_parquet function --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 21852169fea4a..04e9b9906c204 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py 
@@ -1476,7 +1476,7 @@ def test_bool_multiIndex_roundtrip_through_parquet(self, pa): columns=pd.MultiIndex.from_tuples([(True, 'B'), (False, 'C')]), ) with tm.ensure_clean("test.parquet") as path: - df.to_parquet(f, engine=pa) + df.to_parquet(path, engine=pa) result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df)