Skip to content

Commit

Permalink
Parquet/Feather IO: disable PyExtensionType autoload (pandas-dev#55894)
Browse files Browse the repository at this point in the history
* Parquet/Feather IO: disable PyExtensionType autoload

* don't install hotfix for pyarrow >= 14.0.1

* move patching to extension type definitions

* expand error message

* fix compat for pyarrow not installed

* add whatsnew
  • Loading branch information
jorisvandenbossche authored Nov 9, 2023
1 parent a41b545 commit 851fea0
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Bug fixes
~~~~~~~~~
- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)
- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`)

.. ---------------------------------------------------------------------------
.. _whatsnew_213.other:
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
pa_version_under11p0,
pa_version_under13p0,
pa_version_under14p0,
pa_version_under14p1,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -184,6 +185,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"pa_version_under11p0",
"pa_version_under13p0",
"pa_version_under14p0",
"pa_version_under14p1",
"IS64",
"ISMUSL",
"PY310",
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
pa_version_under12p0 = _palv < Version("12.0.0")
pa_version_under13p0 = _palv < Version("13.0.0")
pa_version_under14p0 = _palv < Version("14.0.0")
pa_version_under14p1 = _palv < Version("14.0.1")
except ImportError:
pa_version_under10p1 = True
pa_version_under11p0 = True
pa_version_under12p0 = True
pa_version_under13p0 = True
pa_version_under14p0 = True
pa_version_under14p1 = True
60 changes: 60 additions & 0 deletions pandas/core/arrays/arrow/extension_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import pyarrow

from pandas.compat import pa_version_under14p1

from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
Expand Down Expand Up @@ -112,3 +114,61 @@ def to_pandas_dtype(self) -> IntervalDtype:
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)


# Template for the RuntimeError raised when a file attempts to deserialize an
# 'arrow.py_extension_type' column. It is filled in with ``str.format`` using
# the keys ``storage_type``, ``serialized`` and ``pickle_disassembly``.
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""


def patch_pyarrow() -> None:
    """
    Make pyarrow refuse to deserialize ``arrow.py_extension_type`` columns.

    The function is a no-op when pyarrow is at least 14.0.1 (which has its own
    mechanism) or when ``pyarrow._hotfix_installed`` is already truthy, i.e.
    pyarrow-hotfix or a previous call of this function already installed the
    guard. Otherwise the ``"arrow.py_extension_type"`` name is re-registered
    with a placeholder extension type whose deserialization always raises, and
    ``pyarrow._hotfix_installed`` is set to ``True``.

    Reading untrusted Parquet or Feather files with a PyExtensionType column
    allows arbitrary code execution, which is why loading is blocked.
    """
    # starting from pyarrow 14.0.1, it has its own mechanism
    if not pa_version_under14p1:
        return

    # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        """Placeholder type that rejects every deserialization attempt."""

        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            """
            Always raise ``RuntimeError`` describing the rejected payload.

            The payload is only disassembled for the error message; it is
            never unpickled.
            """
            import io
            import pickletools

            out = io.StringIO()
            try:
                pickletools.dis(serialized, out)
            except Exception as err:
                # ``serialized`` comes from an untrusted file and need not be a
                # well-formed pickle. The disassembly is purely diagnostic, so
                # keep whatever was written so far and still raise the
                # RuntimeError below instead of leaking the disassembler error.
                out.write(f"<pickle disassembly failed: {err!r}>\n")
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    # Swap the built-in handler for the forbidding one under the same name.
    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    # Same marker attribute that is checked at the top of this function, so
    # the patch is applied at most once.
    pyarrow._hotfix_installed = True


# Install the guard as a side effect of importing this module.
patch_pyarrow()
3 changes: 3 additions & 0 deletions pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ def read_feather(
import_optional_dependency("pyarrow")
from pyarrow import feather

# import utils to register the pyarrow extension types
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401

check_dtype_backend(dtype_backend)

with get_handle(
Expand Down

0 comments on commit 851fea0

Please sign in to comment.