def _get_ids_image(
    self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
) -> List[Union[str, List[str]]]:
    """
    Collect the identifiers of the image XObjects reachable from ``obj``.

    Args:
        obj: Dictionary whose resources are scanned; the page itself
            when omitted.
        ancest: Names of the enclosing XObjects that lead to ``obj``;
            empty for a top-level scan.

    Returns:
        One entry per image: the bare XObject name when the image sits
        directly in the scanned resources with no ancestors, otherwise
        the list of ancestor names followed by the image name.
    """
    container = self if obj is None else obj
    path = [] if ancest is None else ancest
    found: List[Union[str, List[str]]] = []

    # Nothing to report when there is no resources/XObject dictionary.
    if PG.RESOURCES not in container:
        return found
    if RES.XOBJECT not in cast(DictionaryObject, container[PG.RESOURCES]):
        return found

    xobjects = container[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
    for key in xobjects:
        if xobjects[key][IA.SUBTYPE] == "/Image":
            found.append(path + [key] if path else key)
            continue
        # Any other XObject is treated as a form that may hold images.
        found.extend(self._get_ids_image(xobjects[key], path + [key]))
    return found
cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] + ) + except KeyError: + raise + if isinstance(id, str): + extension, byte_stream, img = _xobj_to_image( + cast(DictionaryObject, xobjs[id]) + ) + f = ImageFile( + name=f"{id[1:]}{extension}", + data=byte_stream, + image=img, + indirect_reference=xobjs[id].indirect_reference, + ) + return f + else: # in a sub object + ids = id[1:] + return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) + + @property + def images(self) -> List[ImageFile]: + """ + Read-only property that emulates a list of files + Get a list of all images of the page. + + the key can be: + µan str (for top object) or a tuple for image within XObject forms + or an int + ex: + ``` + reader.pages[0].images[0] # return fist image + reader.pages[0].images['/I0'] # return image '/I0' + reader.pages[0].images['/TP1','/Image1'] # return image '/Image1' + within '/TP1' Xobject/Form + for img in reader.pages[0].images: # loop within all objects + ``` + + images.keys() and image.items() work + + The File object properties are: + .name : name of the object + .data : bytes of the object + .image : PIL Image Object + .indirect_reference : object reference + """ + return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore + @property def rotation(self) -> int: """ @@ -2242,3 +2324,60 @@ def _get_fonts_walk( _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) return fnt, emb # return the sets for each page + + +class _VirtualListImages(Sequence): + def __init__( + self, + ids_function: Callable[[], List[Union[str, List[str]]]], + get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile], + ) -> None: + self.ids_function = ids_function + self.get_function = get_function + self.current = -1 + + def __len__(self) -> int: + return len(self.ids_function()) + + def keys(self) -> List[Union[str, List[str]]]: + return self.ids_function() + + def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]: + return 
[(x, self[x]) for x in self.ids_function()] + + @overload + def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile: + ... + + @overload + def __getitem__(self, index: slice) -> Sequence[ImageFile]: + ... + + def __getitem__( + self, index: Union[int, slice, str, List[str], Tuple[str]] + ) -> Union[ImageFile, Sequence[ImageFile]]: + lst = self.ids_function() + if isinstance(index, slice): + indices = range(*index.indices(len(self))) + lst = [lst[x] for x in indices] + cls = type(self) + return cls((lambda: lst), self.get_function) + if isinstance(index, (str, list, tuple)): + return self.get_function(index) + if not isinstance(index, int): + raise TypeError("invalid sequence indices type") + len_self = len(lst) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.get_function(lst[index]) + + def __iter__(self) -> Iterator[ImageFile]: + for i in range(len(self)): + yield self[i] + + def __str__(self) -> str: + p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())] + return f"[{', '.join(p)}]" diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 4368b0a52..9883b30fa 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -496,7 +496,15 @@ class File: data: bytes def __str__(self) -> str: - return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))})" + return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})" def __repr__(self) -> str: - return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))}, hash: {hash(self.data)})" + return self.__str__()[:-1] + f", hash: {hash(self.data)})" + + +@dataclass +class ImageFile(File): + from .generic import IndirectObject + + image: Optional[Any] = None # optional ; direct PIL image access + indirect_reference: Optional[IndirectObject] = None # optional ; link to PdfObject diff --git a/pypdf/filters.py 
b/pypdf/filters.py index 4bece9c4f..89febcc19 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -38,7 +38,7 @@ import struct import zlib from io import BytesIO -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union, cast +from typing import Any, Dict, Optional, Tuple, Union, cast from ._utils import b_, deprecate_with_replacement, ord_, paeth_predictor from .constants import CcittFaxDecodeParameters as CCITT @@ -57,13 +57,12 @@ NullObject, ) -if TYPE_CHECKING: - try: - from typing import Literal # type: ignore[attr-defined] - except ImportError: - # PEP 586 introduced typing.Literal with Python 3.8 - # For older Python versions, the backport typing_extensions is necessary: - from typing_extensions import Literal # type: ignore[misc, assignment] +try: + from typing import Literal, TypeAlias # type: ignore[attr-defined] +except ImportError: + # PEP 586 introduced typing.Literal with Python 3.8 + # For older Python versions, the backport typing_extensions is necessary: + from typing_extensions import Literal, TypeAlias # type: ignore[misc, assignment] def decompress(data: bytes) -> bytes: @@ -158,8 +157,17 @@ def decode( math.ceil(columns * bits_per_component / 8) + 1 ) # number of bytes + # TIFF prediction: + if predictor == 2: + rowlength -= 1 # remove the predictor byte + bpp = rowlength // columns + str_data = bytearray(str_data) + for i in range(len(str_data)): + if i % rowlength >= bpp: + str_data[i] = (str_data[i] + str_data[i - bpp]) % 256 + str_data = bytes(str_data) # PNG prediction: - if 10 <= predictor <= 15: + elif 10 <= predictor <= 15: str_data = FlateDecode._decode_png_prediction(str_data, columns, rowlength) # type: ignore else: # unsupported predictor @@ -624,7 +632,10 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated return decode_stream_data(stream) -def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: +mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"] 
+ + +def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]: """ Users need to have the pillow package installed. @@ -635,7 +646,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: x_object_obj: Returns: - Tuple[file extension, bytes] + Tuple[file extension, bytes, PIL.Image.Image] """ try: from PIL import Image @@ -735,4 +746,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: img.save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() - return extension, data + try: # temporary try/except until other fixes of images + img = Image.open(BytesIO(data)) + except Exception: + img = None # type: ignore + return extension, data, img diff --git a/tests/test_filters.py b/tests/test_filters.py index 08e42ff26..e7cb60551 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -3,6 +3,7 @@ import sys from io import BytesIO from itertools import product as cartesian_product +from pathlib import Path from unittest.mock import patch import pytest @@ -31,6 +32,10 @@ string.whitespace, # Add more... ) +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + @pytest.mark.parametrize( ("predictor", "s"), list(cartesian_product([1], filter_inputs)) @@ -250,7 +255,7 @@ def test_image_without_imagemagic(): for page in reader.pages: with pytest.raises(ImportError) as exc: - page.images + page.images[0] assert exc.value.args[0] == ( "pillow is required to do image extraction. 
@pytest.mark.enable_socket()
def test_image_new_property():
    """Exercise the list-like ``images`` property on a downloaded PDF."""
    pdf_url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
    pdf_name = "pdf_font_garbled.pdf"
    reader = PdfReader(BytesIO(get_pdf_from_url(pdf_url, name=pdf_name)))
    page = reader.pages[0]

    page.images.keys()
    # many tests disabled until other image fixes:
    # page.images.items()
    # page.images[0].name
    page.images[-1].data
    page.images["/TPL1", "/Image5"].image
    # assert (
    #     page.images["/I0"].indirect_reference.get_object()
    #     == page["/Resources"]["/XObject"]["/I0"]
    # )
    # list(page.images[0:2])

    # Unsupported key type and out-of-range position must be rejected.
    with pytest.raises(TypeError):
        page.images[b"0"]
    with pytest.raises(IndexError):
        page.images[9999]

    # just for test coverage:
    with pytest.raises(KeyError):
        page._get_image(["test"], page)
    assert list(PageObject(None, None).images) == []