From 851fea0ea38985cd7d5e0a3b7a7a7539b5883307 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 22:22:04 +0100 Subject: [PATCH] Parquet/Feather IO: disable PyExtensionType autoload (#55894) * Parquet/Feather IO: disable PyExtensionType autoload * don't install hotfix for pyarrow >= 14.0.1 * move patching to extension type definitions * expand error message * fix compat for pyarrow not installed * add whatsnew --- doc/source/whatsnew/v2.1.3.rst | 1 + pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/core/arrays/arrow/extension_types.py | 60 +++++++++++++++++++++ pandas/io/feather_format.py | 3 ++ 5 files changed, 68 insertions(+) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 6413e16afd800..531559c259b44 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -23,6 +23,7 @@ Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`) .. --------------------------------------------------------------------------- .. 
_whatsnew_213.other: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index ea8cfb7cc144b..738442fab8c70 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -29,6 +29,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -184,6 +185,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index d125904ba83f8..8dcb2669aa663 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -13,9 +13,11 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 7814a77a1cdc5..72bfd6f2212f8 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self) -> IntervalDtype: # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code 
execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..c451cd6c139ed 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 + check_dtype_backend(dtype_backend) with get_handle(