From 5d1d71cf99478ddf56ccdcf3e24770d4b94aaa17 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 7 Sep 2022 21:30:24 +0200 Subject: [PATCH 01/19] ENH: Add PageObject.images attribute --- PyPDF2/_page.py | 18 ++++++++++++++++++ PyPDF2/_utils.py | 7 +++++++ setup.cfg | 1 + tests/test_reader.py | 19 +------------------ 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 682db1151..08f8848a4 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -48,15 +48,18 @@ from ._cmap import build_char_map, unknown_char_map from ._utils import ( CompressedTransformationMatrix, + File, TransformationMatrixType, deprecate_no_replacement, deprecate_with_replacement, logger_warning, matrix_multiply, ) +from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Ressources as RES from .errors import PageSizeNotDefinedError +from .filters import _xobj_to_image from .generic import ( ArrayObject, ContentStream, @@ -345,6 +348,21 @@ def createBlankPage( deprecate_with_replacement("createBlankPage", "create_blank_page") return PageObject.create_blank_page(pdf, width, height) + @property + def images(self) -> List[File]: + images_extracted = [] + if RES.XOBJECT not in self[PG.RESOURCES]: + return images_extracted + + x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream = _xobj_to_image(x_object[obj]) + if extension is not None: + filename = obj[1:] + ".png" # TODO + images_extracted.append(File(name=filename, data=byte_stream)) + return images_extracted + def rotate(self, angle: int) -> "PageObject": """ Rotate a page clockwise by increments of 90 degrees. diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index eeceda1b4..3aa8986bb 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -33,6 +33,7 @@ import logging import warnings from codecs import getencoder +from dataclasses import dataclass from io import ( DEFAULT_BUFFER_SIZE, BufferedReader, @@ -413,3 +414,9 @@ def rename_kwargs( # type: ignore f"{old_term} is deprecated as an argument. Use {new_term} instead" ) ) + + +@dataclass +class File: + name: str + data: bytes diff --git a/setup.cfg b/setup.cfg index 2c0eebe8f..e3fa3556d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,6 +39,7 @@ packages = python_requires = >=3.6 install_requires = typing_extensions >= 3.10.0.0; python_version < '3.10' + dataclasses; python_version < '3.7' [options.extras_require] crypto = PyCryptodome diff --git a/tests/test_reader.py b/tests/test_reader.py index 12adb4b79..ce673ad21 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -186,26 +186,9 @@ def test_get_images(src, nb_images): page = reader.pages[-1] page = reader.pages[0] - images_extracted = [] - - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = obj[1:] + ".png" - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) - + images_extracted = page.images assert len(images_extracted) == nb_images - # Cleanup - for filepath in images_extracted: - os.remove(filepath) - @pytest.mark.parametrize( ("strict", "with_prev_0", "startx_correction", "should_fail", "warning_msgs"), From 55669e31d149d3e3be22ff60fe800bd9d95d5a8c Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 7 Sep 2022 21:38:45 +0200 Subject: [PATCH 02/19] Add docs --- docs/index.rst | 1 + docs/user/extract-images.md | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 docs/user/extract-images.md diff --git a/docs/index.rst b/docs/index.rst index 016e10fe9..a2f7b044b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ You can contribute to `PyPDF2 on Github `_. user/suppress-warnings user/metadata user/extract-text + user/extract-images user/encryption-decryption user/merging-pdfs user/cropping-and-transforming diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md new file mode 100644 index 000000000..adb338b44 --- /dev/null +++ b/docs/user/extract-images.md @@ -0,0 +1,13 @@ +# Extract Images + +```python +from PyPDF2 import PdfReader + +reader = PdfReader("example.pdf") + +page = reader.pages[0] + +for image_file_object in page.images: + with open(image_file_object, "wb") as fp: + fp.write(image_file_object.data) +``` From 8cb5c9893b72024815a0d28ae7eef39adb1474eb Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 8 Sep 2022 13:13:21 +0200 Subject: [PATCH 03/19] Add docs --- PyPDF2/_utils.py | 6 ++++++ docs/user/extract-images.md | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 3aa8986bb..0e69eeccb 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -420,3 +420,9 @@ def rename_kwargs( # type: ignore class File: name: str data: bytes + mime_type: str + + @property + def file_extension(self) -> str: + if self.mime_type == "image/png": + return "png" diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md index adb338b44..5c07bfc44 100644 --- a/docs/user/extract-images.md +++ b/docs/user/extract-images.md @@ -1,13 +1,18 @@ # Extract Images +Every page of a PDF document can contain an arbitrary amount of images. +The names of the files may not be unique. + ```python from PyPDF2 import PdfReader reader = PdfReader("example.pdf") page = reader.pages[0] +count = 0 for image_file_object in page.images: - with open(image_file_object, "wb") as fp: + with open(str(count) + image_file_object.name, "wb") as fp: fp.write(image_file_object.data) + count += 1 ``` From 85a67e5d7af7d6818c531253b9247caacf285b28 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 10 Sep 2022 22:05:41 +0200 Subject: [PATCH 04/19] fix flake8 --- tests/test_reader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index ce673ad21..0a45c54e9 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -10,7 +10,6 @@ from PyPDF2._reader import convert_to_int, convertToInt from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG -from PyPDF2.constants import Ressources as RES from PyPDF2.errors import ( EmptyFileError, FileNotDecryptedError, @@ -18,7 +17,6 @@ PdfReadWarning, WrongPasswordError, ) -from PyPDF2.filters import _xobj_to_image from PyPDF2.generic import Destination from . import get_pdf_from_url, normalize_warnings From 4aed7eecf860e2ba9f033deae557506be577e111 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 07:20:22 +0200 Subject: [PATCH 05/19] Add mime types --- PyPDF2/_page.py | 4 +++- PyPDF2/_utils.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 08f8848a4..165e9c4c8 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -360,7 +360,9 @@ def images(self) -> List[File]: extension, byte_stream = _xobj_to_image(x_object[obj]) if extension is not None: filename = obj[1:] + ".png" # TODO - images_extracted.append(File(name=filename, data=byte_stream)) + images_extracted.append( + File(name=filename, data=byte_stream, mime_type="image/png") + ) return images_extracted def rotate(self, angle: int) -> "PageObject": diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 0e69eeccb..2f3382c58 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -426,3 +426,8 @@ class File: def file_extension(self) -> str: if self.mime_type == "image/png": return "png" + if self.mime_type == "image/jpeg": + return "jpg" + if self.mime_type == "image/gif": + return "gif" + return "unknown" From 82e4796366a3fead74bf362b1e85481f89bee7e9 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 07:58:40 +0200 Subject: [PATCH 06/19] Add more mime types --- PyPDF2/_page.py | 8 ++++---- PyPDF2/_utils.py | 19 ++++++++++++------- PyPDF2/filters.py | 22 +++++++++++++--------- tests/test_reader.py | 34 +++++++++++++++++++++++----------- 4 files changed, 52 insertions(+), 31 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 165e9c4c8..4baa96d5d 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -357,11 +357,11 @@ def images(self) -> List[File]: x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() for obj in x_object: if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = obj[1:] + ".png" # TODO + mime_type, byte_stream = _xobj_to_image(x_object[obj]) + if mime_type is not None: + filename = f"{obj[1:]}.{File._mime2extension(mime_type)}" images_extracted.append( - File(name=filename, data=byte_stream, mime_type="image/png") + File(name=filename, data=byte_stream, mime_type=mime_type) ) return images_extracted diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 2f3382c58..9ae8c29eb 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -424,10 +424,15 @@ class File: @property def file_extension(self) -> str: - if self.mime_type == "image/png": - return "png" - if self.mime_type == "image/jpeg": - return "jpg" - if self.mime_type == "image/gif": - return "gif" - return "unknown" + return File._mime2extension(self.mime_type) + + @staticmethod + def _mime2extension(mime_type: str) -> str: + mapping = { + "image/png": "png", + "image/jpeg": "jpg", + "image/x-jp2": "jp2", + "image/gif": "gif", + "image/tiff": "tiff", + } + return mapping.get(mime_type, "unknown") diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 4ac651b39..d0b97b451 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -562,7 +562,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: It's unclear if PyPDF2 will keep this function here, hence it's private. It might get removed at any point. - :return: Tuple[file extension, bytes] + :return: Tuple[mime type, bytes] """ from PIL import Image @@ -576,10 +576,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: mode: Literal["RGB", "P"] = "RGB" else: mode = "P" - extension = None + mime_type = None if SA.FILTER in x_object_obj: if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: - extension = ".png" + mime_type = "image/png" color_space = None if "/ColorSpace" in x_object_obj: color_space = x_object_obj["/ColorSpace"].get_object() @@ -606,19 +606,23 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE], ): - extension = ".png" + # I'm not sure if the mime types have any relationship to the filters + if x_object_obj[SA.FILTER] == FT.LZW_DECODE: + mime_type = "image/tiff" + else: + mime_type = "image/png" data = b_(data) elif x_object_obj[SA.FILTER] == FT.DCT_DECODE: - extension = ".jpg" + mime_type = "image/jpeg" elif x_object_obj[SA.FILTER] == "/JPXDecode": - extension = ".jp2" + mime_type = "image/x-jp2" elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE: - extension = ".tiff" + mime_type = "image/tiff" else: - extension = ".png" + mime_type = "image/png" img = Image.frombytes(mode, size, data) img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() - return extension, data + return mime_type, data diff --git a/tests/test_reader.py b/tests/test_reader.py index 0a45c54e9..2428e6c26 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -5,6 +5,7 @@ from pathlib import Path import pytest +from PIL import Image from PyPDF2 import PdfReader from PyPDF2._reader import convert_to_int, convertToInt @@ -164,19 +165,22 @@ def test_get_outline(src, outline_elements): @pytest.mark.parametrize( - ("src", "nb_images"), + ("src", "expected_images"), [ - ("pdflatex-outline.pdf", 0), - ("crazyones.pdf", 0), - ("git.pdf", 1), - ("imagemagick-lzw.pdf", 1), - ("imagemagick-ASCII85Decode.pdf", 1), - ("imagemagick-CCITTFaxDecode.pdf", 1), + ("pdflatex-outline.pdf", []), + ("crazyones.pdf", []), + ("git.pdf", [("Image9.png", "image/png")]), + ("imagemagick-lzw.pdf", [("Im0.png", "unknown")]), # Broken extraction + ( + "imagemagick-ASCII85Decode.pdf", + [("Im0.png", "unknown")], + ), # Broken extraction + ("imagemagick-CCITTFaxDecode.pdf", [("Im0.png", "image/tiff")]), ], ) -def test_get_images(src, nb_images): - src = RESOURCE_ROOT / src - reader = PdfReader(src) +def test_get_images(src, expected_images): + src_abs = RESOURCE_ROOT / src + reader = PdfReader(src_abs) with pytest.raises(TypeError): page = reader.pages["0"] @@ -185,7 +189,15 @@ def test_get_images(src, nb_images): page = reader.pages[0] images_extracted = page.images - assert len(images_extracted) == nb_images + assert len(images_extracted) == len(expected_images) + for image, (expected_image, expected_mime) in zip( + images_extracted, expected_images + ): + assert image.name == expected_image + with open(f"test-out-{src}-{image.name}", "wb") as fp: + fp.write(image.data) + assert image.file_extension.upper() == Image.open(io.BytesIO(image.data)).format + assert image.mime_type == expected_mime @pytest.mark.parametrize( From ad19cc3187ec21d33a05cd99562be0dcbf13becb Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 08:43:06 +0200 Subject: [PATCH 07/19] Fix image extraction --- tests/test_reader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 2428e6c26..15b64eb17 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -170,11 +170,16 @@ def test_get_outline(src, outline_elements): ("pdflatex-outline.pdf", []), ("crazyones.pdf", []), ("git.pdf", [("Image9.png", "image/png")]), - ("imagemagick-lzw.pdf", [("Im0.png", "unknown")]), # Broken extraction - ( + pytest.param( + "imagemagick-lzw.pdf", + [("Im0.png", "unknown")], + marks=pytest.mark.xfail(reason="broken image extraction"), + ), + pytest.param( "imagemagick-ASCII85Decode.pdf", [("Im0.png", "unknown")], - ), # Broken extraction + marks=pytest.mark.xfail(reason="broken image extraction"), + ), ("imagemagick-CCITTFaxDecode.pdf", [("Im0.png", "image/tiff")]), ], ) From bb7185b15621fe0daba117752e935d4656961bdc Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 19:02:42 +0200 Subject: [PATCH 08/19] Update workflows --- tests/test_workflows.py | 48 +++++++++++++---------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 57cab7018..046949aa9 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -687,17 +687,11 @@ def test_image_extraction(url, name): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename, "wb") as img: + img.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection @@ -720,17 +714,11 @@ def test_image_extraction_strict(): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename) as fp: + fp.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection @@ -759,17 +747,11 @@ def test_image_extraction2(url, name): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename, "wb") as img: + img.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection From fe7a965c0b1127a5dbabefe8d21594c854ba21ee Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 19:34:17 +0200 Subject: [PATCH 09/19] Flake8 fix --- tests/test_workflows.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 046949aa9..6bdcb6da5 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -14,11 +14,8 @@ import pytest from PyPDF2 import PdfMerger, PdfReader, PdfWriter -from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG -from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadWarning -from PyPDF2.filters import _xobj_to_image from . import get_pdf_from_url, normalize_warnings From c18d2a6daccd8d9c014000add52f0814fb9c8a44 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 19:43:38 +0200 Subject: [PATCH 10/19] Update --- tests/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 6bdcb6da5..4ac9814f4 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -713,7 +713,7 @@ def test_image_extraction_strict(): for page in reader.pages: for image in page.images: filename = root / image.name - with open(filename) as fp: + with open(filename, "wb") as fp: fp.write(image.data) images_extracted.append(filename) From 4d8ac662ed86f2f75515f967bc5224d8623014c6 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 21:45:21 +0200 Subject: [PATCH 11/19] mime type --- PyPDF2/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index d0b97b451..d241309d0 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -607,7 +607,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: [FT.CCITT_FAX_DECODE], ): # I'm not sure if the mime types have any relationship to the filters - if x_object_obj[SA.FILTER] == FT.LZW_DECODE: + if x_object_obj[SA.FILTER] in [FT.LZW_DECODE, FT.CCITT_FAX_DECODE]: mime_type = "image/tiff" else: mime_type = "image/png" From 53bd98c699213467bbf043cc89036021269662d9 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 21:54:54 +0200 Subject: [PATCH 12/19] Fix --- PyPDF2/filters.py | 2 +- tests/test_reader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index d241309d0..bef01b8d7 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -607,7 +607,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: [FT.CCITT_FAX_DECODE], ): # I'm not sure if the mime types have any relationship to the filters - if x_object_obj[SA.FILTER] in [FT.LZW_DECODE, FT.CCITT_FAX_DECODE]: + if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]: mime_type = "image/tiff" else: mime_type = "image/png" diff --git a/tests/test_reader.py b/tests/test_reader.py index 15b64eb17..67b421dce 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -180,7 +180,7 @@ def test_get_outline(src, outline_elements): [("Im0.png", "unknown")], marks=pytest.mark.xfail(reason="broken image extraction"), ), - ("imagemagick-CCITTFaxDecode.pdf", [("Im0.png", "image/tiff")]), + ("imagemagick-CCITTFaxDecode.pdf", [("Im0.tiff", "image/tiff")]), ], ) def test_get_images(src, expected_images): From b0cf635787512592e07830227b5a1b6630c7776c Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 22:05:07 +0200 Subject: [PATCH 13/19] mypy --- PyPDF2/_page.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 4baa96d5d..094a3e7fa 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -350,11 +350,11 @@ def createBlankPage( @property def images(self) -> List[File]: - images_extracted = [] - if RES.XOBJECT not in self[PG.RESOURCES]: + images_extracted: List[File] = [] + if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore return images_extracted - x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() + x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for obj in x_object: if x_object[obj][IA.SUBTYPE] == "/Image": mime_type, byte_stream = _xobj_to_image(x_object[obj]) From f1e7b84fac16931415f966f75bb86e5d2adef91b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 15 Sep 2022 22:16:00 +0200 Subject: [PATCH 14/19] fix imports --- requirements/ci.in | 1 + requirements/ci.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/requirements/ci.in b/requirements/ci.in index 0527a1f05..aa27ae1c1 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -10,3 +10,4 @@ pytest-benchmark pycryptodome typeguard types-Pillow +types-dataclasses diff --git a/requirements/ci.txt b/requirements/ci.txt index bf8372cb2..ab2537fb4 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -73,6 +73,8 @@ typed-ast==1.5.4 # via mypy typeguard==2.13.3 # via -r requirements/ci.in +types-dataclasses==0.6.6 + # via -r requirements/ci.in types-pillow==9.2.1 # via -r requirements/ci.in typing-extensions==4.1.1 From 30fbf417aa840ffe7400b733dc4f64b8e4de83a4 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 18 Sep 2022 11:24:55 +0200 Subject: [PATCH 15/19] Rename 'file_extension' to 'format' This is consistent with Pillow: https://pillow.readthedocs.io/en/latest/reference/Image.html#PIL.Image.Image.format Co-authored-by: Matthew Peveler --- PyPDF2/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 9ae8c29eb..57d575328 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -423,7 +423,7 @@ class File: mime_type: str @property - def file_extension(self) -> str: + def format(self) -> str: return File._mime2extension(self.mime_type) @staticmethod From 4b77a6aa31f18897cc16f2ceb07b5e1311b74176 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 18 Sep 2022 11:25:43 +0200 Subject: [PATCH 16/19] Format rename --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 67b421dce..234e75c1d 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -201,7 +201,7 @@ def test_get_images(src, expected_images): assert image.name == expected_image with open(f"test-out-{src}-{image.name}", "wb") as fp: fp.write(image.data) - assert image.file_extension.upper() == Image.open(io.BytesIO(image.data)).format + assert image.format.upper() == Image.open(io.BytesIO(image.data)).format assert image.mime_type == expected_mime From c4918817fdf7b7c7e7c59cb76d90449e015cc769 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 24 Sep 2022 07:22:38 +0200 Subject: [PATCH 17/19] Remove mime type --- PyPDF2/_page.py | 10 ++++------ PyPDF2/_utils.py | 16 ---------------- PyPDF2/filters.py | 24 +++++++++++++----------- tests/test_reader.py | 6 ++++-- 4 files changed, 21 insertions(+), 35 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index bc668b562..218161d95 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -377,12 +377,10 @@ def images(self) -> List[File]: x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for obj in x_object: if x_object[obj][IA.SUBTYPE] == "/Image": - mime_type, byte_stream = _xobj_to_image(x_object[obj]) - if mime_type is not None: - filename = f"{obj[1:]}.{File._mime2extension(mime_type)}" - images_extracted.append( - File(name=filename, data=byte_stream, mime_type=mime_type) - ) + extension, byte_stream = _xobj_to_image(x_object[obj]) + if extension is not None: + filename = f"{obj[1:]}{extension}" + images_extracted.append(File(name=filename, data=byte_stream)) return images_extracted @property diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 57d575328..3aa8986bb 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -420,19 +420,3 @@ def rename_kwargs( # type: ignore class File: name: str data: bytes - mime_type: str - - @property - def format(self) -> str: - return File._mime2extension(self.mime_type) - - @staticmethod - def _mime2extension(mime_type: str) -> str: - mapping = { - "image/png": "png", - "image/jpeg": "jpg", - "image/x-jp2": "jp2", - "image/gif": "gif", - "image/tiff": "tiff", - } - return mapping.get(mime_type, "unknown") diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index bef01b8d7..de7ea8433 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -562,7 +562,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: It's unclear if PyPDF2 will keep this function here, hence it's private. It might get removed at any point. - :return: Tuple[mime type, bytes] + :return: Tuple[file extension, bytes] """ from PIL import Image @@ -576,10 +576,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: mode: Literal["RGB", "P"] = "RGB" else: mode = "P" - mime_type = None + extension = None if SA.FILTER in x_object_obj: if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: - mime_type = "image/png" + extension = ".png" # mime_type = "image/png" color_space = None if "/ColorSpace" in x_object_obj: color_space = x_object_obj["/ColorSpace"].get_object() @@ -606,23 +606,25 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE], ): - # I'm not sure if the mime types have any relationship to the filters + # I'm not sure if the following logic is correct. + # There might not be any relationship between the filters and the + # extension if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]: - mime_type = "image/tiff" + extension = ".tiff" # mime_type = "image/tiff" else: - mime_type = "image/png" + extension = ".png" # mime_type = "image/png" data = b_(data) elif x_object_obj[SA.FILTER] == FT.DCT_DECODE: - mime_type = "image/jpeg" + extension = ".jpg" # mime_type = "image/jpeg" elif x_object_obj[SA.FILTER] == "/JPXDecode": - mime_type = "image/x-jp2" + extension = ".jp2" # mime_type = "image/x-jp2" elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE: - mime_type = "image/tiff" + extension = ".tiff" # mime_type = "image/tiff" else: - mime_type = "image/png" + extension = ".png" # mime_type = "image/png" img = Image.frombytes(mode, size, data) img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() - return mime_type, data + return extension, data diff --git a/tests/test_reader.py b/tests/test_reader.py index 234e75c1d..58d7213a5 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -201,8 +201,10 @@ def test_get_images(src, expected_images): assert image.name == expected_image with open(f"test-out-{src}-{image.name}", "wb") as fp: fp.write(image.data) - assert image.format.upper() == Image.open(io.BytesIO(image.data)).format - assert image.mime_type == expected_mime + assert ( + image.name.split(".")[-1].upper() + == Image.open(io.BytesIO(image.data)).format + ) @pytest.mark.parametrize( From 44efe789dc6a201ba8c964693117dc24b322d23e Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 24 Sep 2022 07:27:45 +0200 Subject: [PATCH 18/19] Add docstring mentioning inline images --- PyPDF2/_page.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 218161d95..76bde1188 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -370,6 +370,12 @@ def createBlankPage( @property def images(self) -> List[File]: + """ + Get a list of all images of the page. + + For the moment, this does NOT include inline images. They will be added + in future. + """ images_extracted: List[File] = [] if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore return images_extracted From 50447affac9e727555557a461835c8428b7a1dda Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 24 Sep 2022 07:29:44 +0200 Subject: [PATCH 19/19] Fix test --- tests/test_reader.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 58d7213a5..693060a50 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -169,18 +169,18 @@ def test_get_outline(src, outline_elements): [ ("pdflatex-outline.pdf", []), ("crazyones.pdf", []), - ("git.pdf", [("Image9.png", "image/png")]), + ("git.pdf", ["Image9.png"]), pytest.param( "imagemagick-lzw.pdf", - [("Im0.png", "unknown")], + ["Im0.png"], marks=pytest.mark.xfail(reason="broken image extraction"), ), pytest.param( "imagemagick-ASCII85Decode.pdf", - [("Im0.png", "unknown")], + ["Im0.png"], marks=pytest.mark.xfail(reason="broken image extraction"), ), - ("imagemagick-CCITTFaxDecode.pdf", [("Im0.tiff", "image/tiff")]), + ("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]), ], ) def test_get_images(src, expected_images): @@ -195,9 +195,7 @@ def test_get_images(src, expected_images): images_extracted = page.images assert len(images_extracted) == len(expected_images) - for image, (expected_image, expected_mime) in zip( - images_extracted, expected_images - ): + for image, expected_image in zip(images_extracted, expected_images): assert image.name == expected_image with open(f"test-out-{src}-{image.name}", "wb") as fp: fp.write(image.data)