ENH: Extend images interface by returning an ImageFile(File) class (#1848)

Extends the current image with new attributes (image/indirect_reference).
py-pdf · Jun 13, 2023 · 4b6d864 · 4b6d864
1 parent b128846
commit 4b6d864
Show file tree

Hide file tree

Showing 5 changed files with 210 additions and 17 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -57,6 +57,7 @@
 from ._utils import (
     CompressedTransformationMatrix,
     File,
+    ImageFile,
     TransformationMatrixType,
     deprecation_no_replacement,
     deprecation_with_replacement,
@@ -444,7 +445,7 @@ def createBlankPage(
         return PageObject.create_blank_page(pdf, width, height)
 
     @property
-    def images(self) -> List[File]:
+    def _old_images(self) -> List[File]:  # deprecated
         """
         Get a list of all images of the page.
 
@@ -460,12 +461,93 @@ def images(self) -> List[File]:
         x_object = self[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
         for obj in x_object:
             if x_object[obj][IA.SUBTYPE] == "/Image":
-                extension, byte_stream = _xobj_to_image(x_object[obj])
+                extension, byte_stream, img = _xobj_to_image(x_object[obj])
                 if extension is not None:
                     filename = f"{obj[1:]}{extension}"
                     images_extracted.append(File(name=filename, data=byte_stream))
         return images_extracted
 
+    def _get_ids_image(
+        self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
+    ) -> List[Union[str, List[str]]]:
+        """
+        Collect the identifiers of all image XObjects reachable from ``obj``.
+
+        Args:
+            obj: dictionary whose resources are scanned; defaults to the page
+                itself.
+            ancest: names of the enclosing XObjects already traversed;
+                defaults to an empty list.
+
+        Returns:
+            One entry per image: the XObject name alone when there is no
+            ancestor, otherwise the list of ancestor names followed by the
+            image name.
+        """
+        if obj is None:
+            obj = self
+        if ancest is None:
+            ancest = []
+        lst: List[Union[str, List[str]]] = []
+        # Without resources, or without an XObject entry in them, there is
+        # nothing to enumerate.
+        if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
+            DictionaryObject, obj[PG.RESOURCES]
+        ):
+            return lst
+
+        x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
+        for o in x_object:
+            if x_object[o][IA.SUBTYPE] == "/Image":
+                # top-level images are identified by name only, nested ones
+                # by their full path
+                lst.append(o if len(ancest) == 0 else ancest + [o])
+            else:  # is a form with possible images inside
+                # every non-image XObject is explored recursively, with its
+                # own name appended to the path
+                lst.extend(self._get_ids_image(x_object[o], ancest + [o]))
+        return lst
+
+    def _get_image(
+        self,
+        id: Union[str, List[str], Tuple[str]],
+        obj: Optional[DictionaryObject] = None,
+    ) -> ImageFile:
+        """
+        Build the ImageFile for the image identified by ``id``.
+
+        Args:
+            id: name of an image XObject, or a list/tuple of names giving the
+                path through the enclosing XObjects down to the image. A
+                one-element list/tuple is treated like the bare name.
+            obj: dictionary whose resources hold the first name of ``id``;
+                defaults to the page itself.
+
+        Returns:
+            An ImageFile whose name is ``id`` without its first character
+            followed by the extension reported by ``_xobj_to_image``.
+
+        Raises:
+            KeyError: if the resources, the XObject entry or a requested name
+                is missing.
+        """
+        if obj is None:
+            obj = cast(DictionaryObject, self)
+        if isinstance(id, tuple):
+            id = list(id)
+        if isinstance(id, list) and len(id) == 1:
+            id = id[0]
+        # A missing entry is reported to the caller as the KeyError raised by
+        # the lookup itself.
+        xobjs = cast(
+            DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
+        )
+        if isinstance(id, str):
+            extension, byte_stream, img = _xobj_to_image(
+                cast(DictionaryObject, xobjs[id])
+            )
+            return ImageFile(
+                name=f"{id[1:]}{extension}",
+                data=byte_stream,
+                image=img,
+                indirect_reference=xobjs[id].indirect_reference,
+            )
+        # The image is nested: descend into the first XObject of the path and
+        # resolve the remaining names from there.
+        return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))
+
+    @property
+    def images(self) -> List[ImageFile]:
+        """
+        Read-only property that emulates a list of the images of the page.
+
+        An image can be looked up with:
+          * an int: its position in the list of images,
+          * a str: the name of a top-level image,
+          * a tuple of str: the path to an image within XObject forms.
+
+        Examples:
+        ```
+        reader.pages[0].images[0]        # return first image
+        reader.pages[0].images['/I0']    # return image '/I0'
+        reader.pages[0].images['/TP1','/Image1'] # return image '/Image1'
+                                                        within '/TP1' XObject/Form
+        for img in reader.pages[0].images: # loop over all images
+        ```
+
+        images.keys() and images.items() work.
+
+        The ImageFile object properties are:
+            .name : name of the object
+            .data : bytes of the object
+            .image : PIL Image object (None if the data could not be opened)
+            .indirect_reference : object reference
+        """
+        return _VirtualListImages(self._get_ids_image, self._get_image)  # type: ignore
+
     @property
     def rotation(self) -> int:
         """
@@ -2242,3 +2324,60 @@ def _get_fonts_walk(
         _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb)
 
     return fnt, emb  # return the sets for each page
+
+
+class _VirtualListImages(Sequence):
+    """
+    Read-only sequence of images computed on demand.
+
+    Nothing is stored: identifiers are obtained by calling ``ids_function``
+    and an image is built by passing one identifier to ``get_function``.
+    Items can be addressed by position, by slice, or directly by identifier
+    (a str, or a list/tuple of str).
+    """
+
+    def __init__(
+        self,
+        ids_function: Callable[[], List[Union[str, List[str]]]],
+        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
+    ) -> None:
+        self.ids_function = ids_function
+        self.get_function = get_function
+        # NOTE(review): not read anywhere in this class - confirm whether it
+        # can be dropped.
+        self.current = -1
+
+    def __len__(self) -> int:
+        return len(self.ids_function())
+
+    def keys(self) -> List[Union[str, List[str]]]:
+        """Return the identifiers of all images."""
+        return self.ids_function()
+
+    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
+        """Return (identifier, image) pairs for all images."""
+        # Identifiers are handed straight to get_function so that the list
+        # of identifiers is computed once, not once per image.
+        return [(x, self.get_function(x)) for x in self.ids_function()]
+
+    @overload
+    def __getitem__(self, index: Union[int, str, List[str], Tuple[str]]) -> ImageFile:
+        ...
+
+    @overload
+    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
+        ...
+
+    def __getitem__(
+        self, index: Union[int, slice, str, List[str], Tuple[str]]
+    ) -> Union[ImageFile, Sequence[ImageFile]]:
+        """
+        Return the image(s) selected by ``index``.
+
+        Raises:
+            TypeError: if ``index`` is not an int, slice, str, list or tuple.
+            IndexError: if an int ``index`` is out of range.
+        """
+        # Identifier lookups do not need the list of identifiers at all.
+        if isinstance(index, (str, list, tuple)):
+            return self.get_function(index)
+        if not isinstance(index, (int, slice)):
+            raise TypeError("invalid sequence indices type")
+        lst = self.ids_function()
+        if isinstance(index, slice):
+            # A slice yields a new virtual list restricted to the selected
+            # identifiers; the images themselves are still built lazily.
+            selected = lst[index]
+            return type(self)((lambda: selected), self.get_function)
+        len_self = len(lst)
+        if index < 0:
+            # support negative indexes
+            index = len_self + index
+        if index < 0 or index >= len_self:
+            raise IndexError("sequence index out of range")
+        return self.get_function(lst[index])
+
+    def __iter__(self) -> Iterator[ImageFile]:
+        # Compute the identifiers once for the whole iteration instead of
+        # once per yielded image.
+        for key in self.ids_function():
+            yield self.get_function(key)
+
+    def __str__(self) -> str:
+        p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
+        return f"[{', '.join(p)}]"
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -496,7 +496,15 @@ class File:
     data: bytes
 
     def __str__(self) -> str:
-        return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
+        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
 
     def __repr__(self) -> str:
-        return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))}, hash: {hash(self.data)})"
+        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
+
+
+@dataclass
+class ImageFile(File):
+    """File extended with optional access to the image object and to its PDF reference."""
+
+    # NOTE(review): imported inside the class body, presumably to avoid a
+    # circular import at module load - confirm.
+    from .generic import IndirectObject
+
+    image: Optional[Any] = None  # optional ; direct PIL image access
+    indirect_reference: Optional[IndirectObject] = None  # optional ; link to PdfObject
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -38,7 +38,7 @@
 import struct
 import zlib
 from io import BytesIO
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union, cast
+from typing import Any, Dict, Optional, Tuple, Union, cast
 
 from ._utils import b_, deprecate_with_replacement, ord_, paeth_predictor
 from .constants import CcittFaxDecodeParameters as CCITT
@@ -57,13 +57,12 @@
     NullObject,
 )
 
-if TYPE_CHECKING:
-    try:
-        from typing import Literal  # type: ignore[attr-defined]
-    except ImportError:
-        # PEP 586 introduced typing.Literal with Python 3.8
-        # For older Python versions, the backport typing_extensions is necessary:
-        from typing_extensions import Literal  # type: ignore[misc, assignment]
+try:
+    from typing import Literal, TypeAlias  # type: ignore[attr-defined]
+except ImportError:
+    # PEP 586 introduced typing.Literal with Python 3.8
+    # For older Python versions, the backport typing_extensions is necessary:
+    from typing_extensions import Literal, TypeAlias  # type: ignore[misc, assignment]
 
 
 def decompress(data: bytes) -> bytes:
@@ -158,8 +157,17 @@ def decode(
                 math.ceil(columns * bits_per_component / 8) + 1
             )  # number of bytes
 
+            # TIFF prediction:
+            if predictor == 2:
+                rowlength -= 1  # remove the predictor byte
+                bpp = rowlength // columns
+                str_data = bytearray(str_data)
+                for i in range(len(str_data)):
+                    if i % rowlength >= bpp:
+                        str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
+                str_data = bytes(str_data)
             # PNG prediction:
-            if 10 <= predictor <= 15:
+            elif 10 <= predictor <= 15:
                 str_data = FlateDecode._decode_png_prediction(str_data, columns, rowlength)  # type: ignore
             else:
                 # unsupported predictor
@@ -624,7 +632,10 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]:  # deprecated
     return decode_stream_data(stream)
 
 
-def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
+mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"]
+
+
+def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
     """
     Users need to have the pillow package installed.
 
@@ -635,7 +646,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
       x_object_obj:
 
     Returns:
-        Tuple[file extension, bytes]
+        Tuple[file extension, bytes, PIL.Image.Image]
     """
     try:
         from PIL import Image
@@ -735,4 +746,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
         img.save(img_byte_arr, format="PNG")
         data = img_byte_arr.getvalue()
 
-    return extension, data
+    try:  # temporary try/except until other fixes of images
+        img = Image.open(BytesIO(data))
+    except Exception:
+        img = None  # type: ignore
+    return extension, data, img
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -3,6 +3,7 @@
 import sys
 from io import BytesIO
 from itertools import product as cartesian_product
+from pathlib import Path
 from unittest.mock import patch
 
 import pytest
@@ -31,6 +32,10 @@
     string.whitespace,  # Add more...
 )
 
+TESTS_ROOT = Path(__file__).parent.resolve()
+PROJECT_ROOT = TESTS_ROOT.parent
+RESOURCE_ROOT = PROJECT_ROOT / "resources"
+
 
 @pytest.mark.parametrize(
     ("predictor", "s"), list(cartesian_product([1], filter_inputs))
@@ -250,7 +255,7 @@ def test_image_without_imagemagic():
 
         for page in reader.pages:
             with pytest.raises(ImportError) as exc:
-                page.images
+                page.images[0]
             assert exc.value.args[0] == (
                 "pillow is required to do image extraction. "
                 "It can be installed via 'pip install pypdf[image]'"

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -1134,3 +1134,29 @@ def test_pdf_pages_missing_type():
     reader.pages[0]
     writer = PdfWriter(clone_from=reader)
     writer.pages[0]
+
+
+@pytest.mark.enable_socket()
+def test_image_new_property():
+    """Exercise the ``images`` property: keys, int and tuple lookups, and error cases."""
+    url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
+    name = "pdf_font_garbled.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader.pages[0].images.keys()
+    # many tests disabled until other image fixes:
+    # reader.pages[0].images.items()
+    # reader.pages[0].images[0].name
+    reader.pages[0].images[-1].data
+    reader.pages[0].images["/TPL1", "/Image5"].image
+    # assert (
+    #    reader.pages[0].images["/I0"].indirect_reference.get_object()
+    #     == reader.pages[0]["/Resources"]["/XObject"]["/I0"]
+    # )
+    # list(reader.pages[0].images[0:2])
+    # a bytes key is not an accepted index type
+    with pytest.raises(TypeError):
+        reader.pages[0].images[b"0"]
+    # position beyond the last image
+    with pytest.raises(IndexError):
+        reader.pages[0].images[9999]
+    # just for test coverage:
+    with pytest.raises(KeyError):
+        reader.pages[0]._get_image(["test"], reader.pages[0])
+    # a page without resources exposes no image
+    assert list(PageObject(None, None).images) == []