From 7621d8e6d6c926d1961a66af6255aa02f5f6c1cb Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 7 Dec 2023 09:49:32 +0100 Subject: [PATCH 1/4] BUG: Relax flate decoding for too many lookup values --- pypdf/_xobj_image_helpers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 515c01ebe..577979db3 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -195,7 +195,14 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). - assert len(lookup) == 2 * nb, len(lookup) + if len(lookup) != 2 * nb: + if len(lookup) < 2 * nb: + raise PdfReadError(f"Not enough lookup values: Expected {2 * nb}, got {len(lookup)}.") + logger_warning( + f"Expected {2 * nb} lookup values, got {len(lookup)}. Ignoring trailing ones.", + __name__, + ) + lookup = lookup[:2 * nb] colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ From 676abc891069c1f550ae037e9c8886efba8fee2c Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:52:35 +0100 Subject: [PATCH 2/4] only accept whitespace characters as trailing ones --- pypdf/_xobj_image_helpers.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 577979db3..30b69aa9d 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning +from ._utils import logger_warning, WHITESPACES from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( @@ -195,14 +195,13 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). 
- if len(lookup) != 2 * nb: - if len(lookup) < 2 * nb: - raise PdfReadError(f"Not enough lookup values: Expected {2 * nb}, got {len(lookup)}.") - logger_warning( - f"Expected {2 * nb} lookup values, got {len(lookup)}. Ignoring trailing ones.", - __name__, - ) - lookup = lookup[:2 * nb] + expected_count = 2 * nb + if len(lookup) != expected_count: + if len(lookup) < expected_count: + raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.") + lookup = lookup[:expected_count] + if not all(_value in WHITESPACES for _value in lookup[expected_count:]): + raise PdfReadError(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.") colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ From c34e9fc5f68ff65d2aa941c377a78ecce3666762 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:55:21 +0100 Subject: [PATCH 3/4] fix import sort order --- pypdf/_xobj_image_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 30b69aa9d..a390357dd 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning, WHITESPACES +from ._utils import WHITESPACES, logger_warning from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( From 73308dc17659d02f4b0ea25c5a8c3204c6960db2 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:58:16 +0100 Subject: [PATCH 4/4] add test with trailing newline --- tests/test_filters.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_filters.py b/tests/test_filters.py index 00a548ab0..72d6f76cc 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -649,3 +649,12 @@ def 
test_flate_decode_with_image_mode_1(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for image in reader.pages[7].images: _ = image + + +@pytest.mark.enable_socket() +def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): + """From #2331""" + url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf" + name = "issue2331.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images[0]