Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add PageObject.images attribute #1330

Merged
merged 20 commits into from
Sep 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,18 @@
from ._cmap import build_char_map, unknown_char_map
from ._utils import (
CompressedTransformationMatrix,
File,
TransformationMatrixType,
deprecate_no_replacement,
deprecate_with_replacement,
logger_warning,
matrix_multiply,
)
from .constants import ImageAttributes as IA
from .constants import PageAttributes as PG
from .constants import Ressources as RES
from .errors import PageSizeNotDefinedError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -365,6 +368,27 @@ def createBlankPage(
deprecate_with_replacement("createBlankPage", "create_blank_page")
return PageObject.create_blank_page(pdf, width, height)

@property
def images(self) -> List[File]:
    """
    Get a list of all images of the page.

    Only image XObjects listed in the page's resource dictionary are
    considered. For the moment, this does NOT include inline images. They
    will be added in the future.

    XObjects for which ``_xobj_to_image`` reports no file extension are
    skipped. The name of every returned file is the XObject's key without
    its first character, followed by the reported extension.

    :return: One ``File`` (name and bytes) per extracted image. The list is
        empty if the page has no resource dictionary or no XObjects.
    """
    images_extracted: List[File] = []

    # A page without a resource dictionary cannot reference any image:
    # report "no images" instead of failing with a KeyError.
    if PG.RESOURCES not in self:
        return images_extracted

    # Look the resource dictionary up once and reuse it below.
    resources = self[PG.RESOURCES]
    if RES.XOBJECT not in resources:  # type: ignore
        return images_extracted

    x_object = resources[RES.XOBJECT].get_object()  # type: ignore
    for obj in x_object:
        # XObjects with another subtype are not images.
        if x_object[obj][IA.SUBTYPE] != "/Image":
            continue
        extension, byte_stream = _xobj_to_image(x_object[obj])
        if extension is None:
            # No extension was reported for this image, so skip it.
            continue
        # obj[1:] drops the first character of the key.
        filename = f"{obj[1:]}{extension}"
        images_extracted.append(File(name=filename, data=byte_stream))
    return images_extracted

@property
def rotation(self) -> int:
"""
Expand Down
7 changes: 7 additions & 0 deletions PyPDF2/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import logging
import warnings
from codecs import getencoder
from dataclasses import dataclass
from io import (
DEFAULT_BUFFER_SIZE,
BufferedReader,
Expand Down Expand Up @@ -413,3 +414,9 @@ def rename_kwargs( # type: ignore
f"{old_term} is deprecated as an argument. Use {new_term} instead"
)
)


@dataclass
class File:
    """
    A named chunk of binary data, e.g. an image extracted from a page.

    The constructor ``File(name, data)``, the ``repr`` and the equality
    comparison are generated by ``dataclass``.
    """

    # Name of the file, including its extension (e.g. "Image9.png").
    name: str
    # Content of the file.
    data: bytes
18 changes: 12 additions & 6 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
extension = None
if SA.FILTER in x_object_obj:
if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
extension = ".png"
extension = ".png" # mime_type = "image/png"
color_space = None
if "/ColorSpace" in x_object_obj:
color_space = x_object_obj["/ColorSpace"].get_object()
Expand All @@ -606,16 +606,22 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
[FT.ASCII_85_DECODE],
[FT.CCITT_FAX_DECODE],
):
extension = ".png"
# I'm not sure if the following logic is correct.
# There might not be any relationship between the filters and the
# extension
if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
extension = ".tiff" # mime_type = "image/tiff"
else:
extension = ".png" # mime_type = "image/png"
data = b_(data)
elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
extension = ".jpg"
extension = ".jpg" # mime_type = "image/jpeg"
elif x_object_obj[SA.FILTER] == "/JPXDecode":
extension = ".jp2"
extension = ".jp2" # mime_type = "image/x-jp2"
elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
extension = ".tiff"
extension = ".tiff" # mime_type = "image/tiff"
else:
extension = ".png"
extension = ".png" # mime_type = "image/png"
img = Image.frombytes(mode, size, data)
img_byte_arr = BytesIO()
img.save(img_byte_arr, format="PNG")
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ You can contribute to `PyPDF2 on Github <https://github.com/py-pdf/PyPDF2>`_.
user/suppress-warnings
user/metadata
user/extract-text
user/extract-images
user/encryption-decryption
user/merging-pdfs
user/cropping-and-transforming
Expand Down
18 changes: 18 additions & 0 deletions docs/user/extract-images.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Extract Images

Every page of a PDF document can contain an arbitrary number of images.
The names of the files may not be unique.

```python
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

page = reader.pages[0]
count = 0

for image_file_object in page.images:
with open(str(count) + image_file_object.name, "wb") as fp:
fp.write(image_file_object.data)
count += 1
```
1 change: 1 addition & 0 deletions requirements/ci.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ pytest-benchmark
pycryptodome
typeguard
types-Pillow
types-dataclasses
2 changes: 2 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ typed-ast==1.5.4
# via mypy
typeguard==2.13.3
# via -r requirements/ci.in
types-dataclasses==0.6.6
# via -r requirements/ci.in
types-pillow==9.2.1
# via -r requirements/ci.in
typing-extensions==4.1.1
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ packages =
python_requires = >=3.6
install_requires =
typing_extensions >= 3.10.0.0; python_version < '3.10'
dataclasses; python_version < '3.7'

[options.extras_require]
crypto = PyCryptodome
Expand Down
60 changes: 29 additions & 31 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,19 @@
from pathlib import Path

import pytest
from PIL import Image

from PyPDF2 import PdfReader
from PyPDF2._reader import convert_to_int, convertToInt
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import (
EmptyFileError,
FileNotDecryptedError,
PdfReadError,
PdfReadWarning,
WrongPasswordError,
)
from PyPDF2.filters import _xobj_to_image
from PyPDF2.generic import Destination

from . import get_pdf_from_url, normalize_warnings
Expand Down Expand Up @@ -166,45 +165,44 @@ def test_get_outline(src, outline_elements):


@pytest.mark.parametrize(
("src", "nb_images"),
("src", "expected_images"),
[
("pdflatex-outline.pdf", 0),
("crazyones.pdf", 0),
("git.pdf", 1),
("imagemagick-lzw.pdf", 1),
("imagemagick-ASCII85Decode.pdf", 1),
("imagemagick-CCITTFaxDecode.pdf", 1),
("pdflatex-outline.pdf", []),
("crazyones.pdf", []),
("git.pdf", ["Image9.png"]),
pytest.param(
"imagemagick-lzw.pdf",
["Im0.png"],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
pytest.param(
"imagemagick-ASCII85Decode.pdf",
["Im0.png"],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]),
],
)
def test_get_images(src, nb_images):
src = RESOURCE_ROOT / src
reader = PdfReader(src)
def test_get_images(src, expected_images):
src_abs = RESOURCE_ROOT / src
reader = PdfReader(src_abs)

with pytest.raises(TypeError):
page = reader.pages["0"]

page = reader.pages[-1]
page = reader.pages[0]

images_extracted = []

if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = obj[1:] + ".png"
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)

assert len(images_extracted) == nb_images

# Cleanup
for filepath in images_extracted:
os.remove(filepath)
images_extracted = page.images
assert len(images_extracted) == len(expected_images)
for image, expected_image in zip(images_extracted, expected_images):
assert image.name == expected_image
with open(f"test-out-{src}-{image.name}", "wb") as fp:
fp.write(image.data)
assert (
image.name.split(".")[-1].upper()
== Image.open(io.BytesIO(image.data)).format
)


@pytest.mark.parametrize(
Expand Down
51 changes: 15 additions & 36 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,8 @@
import pytest

from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url, normalize_warnings

Expand Down Expand Up @@ -651,17 +648,11 @@ def test_image_extraction(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand All @@ -684,17 +675,11 @@ def test_image_extraction_strict():
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as fp:
fp.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down Expand Up @@ -723,17 +708,11 @@ def test_image_extraction2(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down