Skip to content

Commit

Permalink
ENH: Extend images interface by returning an ImageFile(File) class (#…
Browse files Browse the repository at this point in the history
…1848)

Extends the current image with new attributes (image/indirect_reference).
  • Loading branch information
pubpub-zz authored Jun 13, 2023
1 parent b128846 commit 4b6d864
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 17 deletions.
143 changes: 141 additions & 2 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from ._utils import (
CompressedTransformationMatrix,
File,
ImageFile,
TransformationMatrixType,
deprecation_no_replacement,
deprecation_with_replacement,
Expand Down Expand Up @@ -444,7 +445,7 @@ def createBlankPage(
return PageObject.create_blank_page(pdf, width, height)

@property
def images(self) -> List[File]:
def _old_images(self) -> List[File]: # deprecated
"""
Get a list of all images of the page.
Expand All @@ -460,12 +461,93 @@ def images(self) -> List[File]:
x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
extension, byte_stream, img = _xobj_to_image(x_object[obj])
if extension is not None:
filename = f"{obj[1:]}{extension}"
images_extracted.append(File(name=filename, data=byte_stream))
return images_extracted

def _get_ids_image(
    self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
) -> List[Union[str, List[str]]]:
    """
    Collect the identifiers of every image reachable from ``obj``.

    Args:
        obj: dictionary whose ``/Resources`` / ``/XObject`` entries are
            scanned; the page itself is used when omitted.
        ancest: names of the enclosing XObjects already traversed; empty
            (or omitted) at the top level.

    Returns:
        One entry per image: the plain XObject name for an image found at
        the top level, or the list of names leading to it (ancestors first,
        image name last) for an image found inside a non-image XObject.
    """
    container = self if obj is None else obj
    path = [] if ancest is None else ancest
    found: List[Union[str, List[str]]] = []

    # Nothing to report when the container has no XObject resources.
    if PG.RESOURCES not in container:
        return found
    resources = container[PG.RESOURCES]
    if RES.XOBJECT not in cast(DictionaryObject, resources):
        return found

    x_objects = resources[RES.XOBJECT].get_object()  # type: ignore
    for key in x_objects:
        entry = x_objects[key]
        nested = path + [key]
        if entry[IA.SUBTYPE] == "/Image":
            # top-level images are identified by their bare name
            found.append(nested if path else key)
        else:
            # is a form with possible images inside
            found.extend(self._get_ids_image(entry, nested))
    return found

def _get_image(
    self,
    id: Union[str, List[str], Tuple[str]],
    obj: Optional[DictionaryObject] = None,
) -> ImageFile:
    """
    Build the ImageFile for the image designated by ``id``.

    Args:
        id: either the name of an image XObject in the resources of ``obj``
            (e.g. ``"/I0"``), or a list/tuple of names where the last item is
            the image name and the preceding items are the XObjects to walk
            through, outermost first.
        obj: dictionary whose ``/Resources`` / ``/XObject`` entries are
            searched; the page itself is used when omitted.

    Returns:
        An ImageFile carrying the file name, the image bytes, the decoded
        image object and the indirect reference of the XObject.

    Raises:
        KeyError: if ``obj`` has no ``/Resources`` or ``/XObject`` entry, or
            if a name in ``id`` is not present there.
    """
    if obj is None:
        obj = cast(DictionaryObject, self)
    if isinstance(id, tuple):
        id = list(id)
    # A one-element path designates a top-level image.
    if isinstance(id, list) and len(id) == 1:
        id = id[0]
    # Missing resources are reported to the caller as a plain KeyError.
    xobjs = cast(
        DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
    )
    if isinstance(id, str):
        extension, byte_stream, img = _xobj_to_image(
            cast(DictionaryObject, xobjs[id])
        )
        return ImageFile(
            # _xobj_to_image may return no extension; do not let the
            # literal text "None" leak into the file name in that case.
            name=f"{id[1:]}{extension or ''}",
            data=byte_stream,
            image=img,
            indirect_reference=xobjs[id].indirect_reference,
        )
    # in a sub object: descend into the first XObject and resolve the rest
    return self._get_image(id[1:], cast(DictionaryObject, xobjs[id[0]]))

@property
def images(self) -> List[ImageFile]:
    """
    Read-only property that emulates a list of the images of the page.

    An image can be selected with:

    * an int: its position in the list
    * a str: the name of a top-level image
    * a tuple: the names leading to an image within XObject forms

    Examples:
        ```
        reader.pages[0].images[0]  # return first image
        reader.pages[0].images['/I0']  # return image '/I0'
        reader.pages[0].images['/TP1','/Image1']  # return image '/Image1'
                                                  # within '/TP1' Xobject/Form
        for img in reader.pages[0].images:  # loop within all objects
        ```

    ``images.keys()`` and ``images.items()`` work.

    The ImageFile object properties are:

    * ``.name``: name of the object
    * ``.data``: bytes of the object
    * ``.image``: PIL Image Object
    * ``.indirect_reference``: object reference
    """
    view = _VirtualListImages(self._get_ids_image, self._get_image)
    return view  # type: ignore

@property
def rotation(self) -> int:
"""
Expand Down Expand Up @@ -2242,3 +2324,60 @@ def _get_fonts_walk(
_get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb)

return fnt, emb # return the sets for each page


class _VirtualListImages(Sequence):
def __init__(
self,
ids_function: Callable[[], List[Union[str, List[str]]]],
get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
) -> None:
self.ids_function = ids_function
self.get_function = get_function
self.current = -1

def __len__(self) -> int:
return len(self.ids_function())

def keys(self) -> List[Union[str, List[str]]]:
return self.ids_function()

def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
return [(x, self[x]) for x in self.ids_function()]

@overload
def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
...

@overload
def __getitem__(self, index: slice) -> Sequence[ImageFile]:
...

def __getitem__(
self, index: Union[int, slice, str, List[str], Tuple[str]]
) -> Union[ImageFile, Sequence[ImageFile]]:
lst = self.ids_function()
if isinstance(index, slice):
indices = range(*index.indices(len(self)))
lst = [lst[x] for x in indices]
cls = type(self)
return cls((lambda: lst), self.get_function)
if isinstance(index, (str, list, tuple)):
return self.get_function(index)
if not isinstance(index, int):
raise TypeError("invalid sequence indices type")
len_self = len(lst)
if index < 0:
# support negative indexes
index = len_self + index
if index < 0 or index >= len_self:
raise IndexError("sequence index out of range")
return self.get_function(lst[index])

def __iter__(self) -> Iterator[ImageFile]:
for i in range(len(self)):
yield self[i]

def __str__(self) -> str:
p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())]
return f"[{', '.join(p)}]"
12 changes: 10 additions & 2 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,15 @@ class File:
data: bytes

def __str__(self) -> str:
return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

def __repr__(self) -> str:
return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))}, hash: {hash(self.data)})"
return self.__str__()[:-1] + f", hash: {hash(self.data)})"


@dataclass
class ImageFile(File):
    """File describing an image, extended with two optional attributes."""

    # The import is done in the class body and must stay above the
    # annotation that uses IndirectObject.
    # NOTE(review): presumably placed here to avoid a circular import with
    # .generic at module load time — confirm.
    from .generic import IndirectObject

    image: Optional[Any] = None  # optional ; direct PIL image access
    indirect_reference: Optional[IndirectObject] = None  # optional ; link to PdfObject
39 changes: 27 additions & 12 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import struct
import zlib
from io import BytesIO
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union, cast
from typing import Any, Dict, Optional, Tuple, Union, cast

from ._utils import b_, deprecate_with_replacement, ord_, paeth_predictor
from .constants import CcittFaxDecodeParameters as CCITT
Expand All @@ -57,13 +57,12 @@
NullObject,
)

if TYPE_CHECKING:
try:
from typing import Literal # type: ignore[attr-defined]
except ImportError:
# PEP 586 introduced typing.Literal with Python 3.8
# For older Python versions, the backport typing_extensions is necessary:
from typing_extensions import Literal # type: ignore[misc, assignment]
try:
from typing import Literal, TypeAlias # type: ignore[attr-defined]
except ImportError:
# PEP 586 introduced typing.Literal with Python 3.8
# For older Python versions, the backport typing_extensions is necessary:
from typing_extensions import Literal, TypeAlias # type: ignore[misc, assignment]


def decompress(data: bytes) -> bytes:
Expand Down Expand Up @@ -158,8 +157,17 @@ def decode(
math.ceil(columns * bits_per_component / 8) + 1
) # number of bytes

# TIFF prediction:
if predictor == 2:
rowlength -= 1 # remove the predictor byte
bpp = rowlength // columns
str_data = bytearray(str_data)
for i in range(len(str_data)):
if i % rowlength >= bpp:
str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
str_data = bytes(str_data)
# PNG prediction:
if 10 <= predictor <= 15:
elif 10 <= predictor <= 15:
str_data = FlateDecode._decode_png_prediction(str_data, columns, rowlength) # type: ignore
else:
# unsupported predictor
Expand Down Expand Up @@ -624,7 +632,10 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated
return decode_stream_data(stream)


def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
# Type alias restricting a string to one of the listed literal values.
# NOTE(review): the values look like Pillow image mode names (plus "" for
# "no mode") — confirm against the places where this alias is used.
mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"]


def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
"""
Users need to have the pillow package installed.
Expand All @@ -635,7 +646,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
x_object_obj:
Returns:
Tuple[file extension, bytes]
Tuple[file extension, bytes, PIL.Image.Image]
"""
try:
from PIL import Image
Expand Down Expand Up @@ -735,4 +746,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()

return extension, data
try: # temporary try/except until other fixes of images
img = Image.open(BytesIO(data))
except Exception:
img = None # type: ignore
return extension, data, img
7 changes: 6 additions & 1 deletion tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from io import BytesIO
from itertools import product as cartesian_product
from pathlib import Path
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -31,6 +32,10 @@
string.whitespace, # Add more...
)

# Resolved path of the directory that contains this test module.
TESTS_ROOT = Path(__file__).parent.resolve()
# Parent directory of TESTS_ROOT, used as the root of the project.
PROJECT_ROOT = TESTS_ROOT.parent
# "resources" directory located directly under PROJECT_ROOT.
RESOURCE_ROOT = PROJECT_ROOT / "resources"


@pytest.mark.parametrize(
("predictor", "s"), list(cartesian_product([1], filter_inputs))
Expand Down Expand Up @@ -250,7 +255,7 @@ def test_image_without_imagemagic():

for page in reader.pages:
with pytest.raises(ImportError) as exc:
page.images
page.images[0]
assert exc.value.args[0] == (
"pillow is required to do image extraction. "
"It can be installed via 'pip install pypdf[image]'"
Expand Down
26 changes: 26 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1134,3 +1134,29 @@ def test_pdf_pages_missing_type():
reader.pages[0]
writer = PdfWriter(clone_from=reader)
writer.pages[0]


@pytest.mark.enable_socket()
def test_image_new_property():
    """Exercise the list-like ``images`` property of a page of a downloaded PDF."""
    pdf_name = "pdf_font_garbled.pdf"
    pdf_url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
    reader = PdfReader(BytesIO(get_pdf_from_url(pdf_url, name=pdf_name)))
    page = reader.pages[0]

    page.images.keys()
    # many tests disabled until other image fixes:
    # page.images.items()
    # page.images[0].name
    page.images[-1].data
    page.images["/TPL1", "/Image5"].image
    # assert (
    #     page.images["/I0"].indirect_reference.get_object()
    #     == page["/Resources"]["/XObject"]["/I0"]
    # )
    # list(page.images[0:2])

    # unsupported key type
    with pytest.raises(TypeError):
        page.images[b"0"]
    # position past the end of the list
    with pytest.raises(IndexError):
        page.images[9999]
    # just for test coverage:
    with pytest.raises(KeyError):
        page._get_image(["test"], page)

    # a page without resources exposes an empty list of images
    assert list(PageObject(None, None).images) == []

0 comments on commit 4b6d864

Please sign in to comment.