From e3c24873e19eaa69be527052d146e12f640a6659 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 6 Jul 2022 22:41:31 +0200 Subject: [PATCH 1/5] BUG: Byte math errors for decoding bitmap PNGs Closes #535 Closes #536 Co-authored-by: Christopher Egner --- PyPDF2/filters.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 077143813..8b78af297 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -108,36 +108,49 @@ def decode( if predictor != 1: # The /Columns param. has 1 as the default value; see ISO 32000, # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8 + DEFAULT_BITS_PER_COMPONENT = 8 if isinstance(decode_parms, ArrayObject): columns = 1 + bits_per_component = DEFAULT_BITS_PER_COMPONENT for decode_parm in decode_parms: if "/Columns" in decode_parm: columns = decode_parm["/Columns"] + if LZW.BITS_PER_COMPONENT in decode_parm: + bits_per_component = decode_parm[LZW.BITS_PER_COMPONENT] else: columns = ( 1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1) ) + bits_per_component = decode_parms.get( + LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT + ) + + # PNG predictor can vary by row and so is the lead byte on each row + rowlength = ( + math.ceil(columns * bits_per_component / 8) + 1 + ) # number of bytes # PNG prediction: if 10 <= predictor <= 15: - str_data = FlateDecode._decode_png_prediction(str_data, columns) # type: ignore + str_data = FlateDecode._decode_png_prediction(str_data, columns, rowlength) # type: ignore else: # unsupported predictor raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}") return str_data @staticmethod - def _decode_png_prediction(data: str, columns: int) -> str: - output = StringIO() + def _decode_png_prediction(data: str, columns: int, rowlength: int) -> bytes: + output = BytesIO() # PNG prediction can vary from row to row - rowlength = columns + 1 - assert len(data) % rowlength == 0 + if len(data) % 
rowlength != 0: + raise PdfReadError("Image data is not rectangular") prev_rowdata = (0,) * rowlength for row in range(len(data) // rowlength): rowdata = [ ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)] ] filter_byte = rowdata[0] + if filter_byte == 0: pass elif filter_byte == 1: @@ -162,7 +175,7 @@ def _decode_png_prediction(data: str, columns: int) -> str: # unsupported PNG filter raise PdfReadError(f"Unsupported PNG filter {filter_byte!r}") prev_rowdata = tuple(rowdata) - output.write("".join([chr(x) for x in rowdata[1:]])) + output.write(bytearray(rowdata[1:])) return output.getvalue() @staticmethod From 9bf83da6e47b3e534279c5ff18c3caf6e3c7092b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 7 Jul 2022 06:32:34 +0200 Subject: [PATCH 2/5] StringIO is no longer needed --- PyPDF2/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 8b78af297..6dbde7093 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -37,7 +37,7 @@ import math import struct import zlib -from io import BytesIO, StringIO +from io import BytesIO from typing import Any, Dict, Optional, Tuple, Union from .generic import ArrayObject, DictionaryObject, NameObject From 5942dfa4aecea37285c2340067e31f5a3a21bd7f Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 7 Jul 2022 07:14:39 +0200 Subject: [PATCH 3/5] Catch none --- PyPDF2/filters.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 6dbde7093..00111ccd9 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -121,8 +121,10 @@ def decode( columns = ( 1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1) ) - bits_per_component = decode_parms.get( - LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT + bits_per_component = ( + decode_parms.get(LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT) + if decode_parms + else DEFAULT_BITS_PER_COMPONENT ) # PNG predictor can vary 
by row and so is the lead byte on each row From 71ce1208074bfd1be03aaa031b8762338e813da0 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 9 Jul 2022 10:04:41 +0200 Subject: [PATCH 4/5] Misc --- .gitignore | 1 + PyPDF2/filters.py | 13 +++++++++---- tests/test_reader.py | 1 - tests/test_workflows.py | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index f4ec295c5..97f93ad19 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,4 @@ PyPDF2_pdfLocation.txt .python-version tests/pdf_cache/ docs/meta/CHANGELOG.md +extracted-images/ diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 00111ccd9..a5ad61ff9 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -425,7 +425,7 @@ def _get_parameters( parameters: Union[None, ArrayObject, DictionaryObject], rows: int ) -> CCITParameters: k = 0 - columns = 0 + columns = 1728 # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter if parameters: if isinstance(parameters, ArrayObject): for decode_parm in parameters: @@ -434,8 +434,10 @@ def _get_parameters( if CCITT.K in decode_parm: k = decode_parm[CCITT.K] else: - columns = parameters[CCITT.COLUMNS] # type: ignore - k = parameters[CCITT.K] # type: ignore + if CCITT.COLUMNS in parameters: + columns = parameters[CCITT.COLUMNS] # type: ignore + if CCITT.K in parameters: + k = parameters[CCITT.K] # type: ignore return CCITParameters(k, columns, rows) @@ -556,7 +558,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT]) data = x_object_obj.get_data() # type: ignore - if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB: + if ( + IA.COLOR_SPACE in x_object_obj + and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB + ): mode: Literal["RGB", "P"] = "RGB" else: mode = "P" diff --git a/tests/test_reader.py b/tests/test_reader.py index 13c52c1ea..12378fdef 100644 --- a/tests/test_reader.py +++ 
b/tests/test_reader.py @@ -705,7 +705,6 @@ def test_read_not_binary_mode(): PdfReader(f) -@pytest.mark.xfail(reason="#416") def test_read_form_416(): url = ( "https://www.fda.gov/downloads/AboutFDA/ReportsManualsForms/Forms/UCM074728.pdf" diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 28d52552c..601a13041 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -2,12 +2,16 @@ import os import sys from io import BytesIO +from pathlib import Path import pytest from PyPDF2 import PdfMerger, PdfReader, PdfWriter +from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG +from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadError, PdfReadWarning +from PyPDF2.filters import _xobj_to_image from . import get_pdf_from_url @@ -372,3 +376,34 @@ def test_merge_output(): # Cleanup merger.close() + + +def test_image_extraction(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf" + name = "tika-994636.pdf" + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + + images_extracted = [] + root = Path("extracted-images") + if not root.exists(): + os.mkdir(root) + + for page in reader.pages: + if RES.XOBJECT in page[PG.RESOURCES]: + x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() + + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream = _xobj_to_image(x_object[obj]) + if extension is not None: + filename = root / (obj[1:] + extension) + with open(filename, "wb") as img: + img.write(byte_stream) + images_extracted.append(filename) + + # Cleanup + return + for filepath in images_extracted: + if os.path.exists(filepath): + os.remove(filepath) From 408bff58955d191e5e22b89452a94b7da2f7b414 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 9 Jul 2022 11:20:55 +0200 Subject: [PATCH 5/5] Add Indexed color space support --- PyPDF2/filters.py | 14 ++++++++++++++ 1 file changed, 14 
insertions(+) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 7aaca877b..543844454 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -570,7 +570,21 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: if SA.FILTER in x_object_obj: if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: extension = ".png" + color_space = None + if "/ColorSpace" in x_object_obj: + color_space = x_object_obj["/ColorSpace"].get_object() + if ( + isinstance(color_space, ArrayObject) + and color_space[0] == "/Indexed" + ): + color_space, base, hival, lookup = ( + value.get_object() for value in color_space + ) + img = Image.frombytes(mode, size, data) + if color_space == "/Indexed": + img.putpalette(lookup.get_data()) + img = img.convert("RGB") if G.S_MASK in x_object_obj: # add alpha channel alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data()) img.putalpha(alpha)