From a3742ae79f03de4527cc58b26cf44b731ae88777 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 2 Dec 2023 09:35:08 +0100 Subject: [PATCH] BUG: Cope with deflated images with CMYK Black Only (#2322) Closes #2321 --- pypdf/_xobj_image_helpers.py | 8 ++++++++ pypdf/filters.py | 16 +++++++++------- tests/test_images.py | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 87bff19b9..515c01ebe 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -91,10 +91,18 @@ def _get_imagemode( ) return mode2, True elif color_space[0] == "/DeviceN": + original_color_space = color_space color_components = len(color_space[1]) color_space = color_space[2] if isinstance(color_space, IndirectObject): # pragma: no cover color_space = color_space.get_object() + if color_space == "/DeviceCMYK" and color_components == 1: + if original_color_space[1][0] != "/Black": + logger_warning( + f"Color {original_color_space[1][0]} converted to Gray. 
Please share PDF with pypdf dev team", + __name__, + ) + return "L", True mode2, invert_color = _get_imagemode( color_space, color_components, prev_mode, depth + 1 ) diff --git a/pypdf/filters.py b/pypdf/filters.py index 3ded07d8c..d1c06a341 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -558,13 +558,16 @@ class CCITTFaxDecode: @staticmethod def _get_parameters( - parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], rows: int + parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], + rows: int, ) -> CCITParameters: # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter k = 0 columns = 1728 if parameters: - parameters_unwrapped = cast(Union[ArrayObject, DictionaryObject], parameters.get_object()) + parameters_unwrapped = cast( + Union[ArrayObject, DictionaryObject], parameters.get_object() + ) if isinstance(parameters_unwrapped, ArrayObject): for decode_parm in parameters_unwrapped: if CCITT.COLUMNS in decode_parm: @@ -778,8 +781,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, alpha = None filters = x_object_obj.get(SA.FILTER, [None]) lfilters = filters[-1] if isinstance(filters, list) else filters - if lfilters == FT.FLATE_DECODE: - img, image_format, extension, invert_color = _handle_flate( + if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): + img, image_format, extension, _ = _handle_flate( size, data, mode, @@ -821,15 +824,14 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, ".png", False, ) - # CMYK image and other colorspaces without decode # requires reverting scale (cf p243,2§ last sentence) decode = x_object_obj.get( IA.DECODE, ([1.0, 0.0] * len(img.getbands())) if ( - (img.mode == "CMYK" or (invert_color and img.mode == "L")) - and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE) + (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)) + or (invert_color and img.mode == "L") ) else None, ) diff --git a/tests/test_images.py 
b/tests/test_images.py index 9e03c9f35..a309549e6 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -220,3 +220,19 @@ def test_loop_in_image_keys(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0]["/Resources"]["/XObject"][NameObject("/toto")] = NullObject() reader.pages[0].images.keys() + + +@pytest.mark.enable_socket() +def test_devicen_cmyk_black_only(): + """Cf #2321""" + url = "https://github.com/py-pdf/pypdf/files/13501846/Addressing_Adversarial_Attacks.pdf" + name = "iss2321.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/cc2dabc1-86e6-4179-a8a4-2b0efea124be" + name = "iss2321_img0.pdf" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[5].images[0].image, img) >= 0.99 + url = "https://github.com/py-pdf/pypdf/assets/4083478/6b64a949-42be-40d5-9eea-95707f350d89" + name = "iss2321_img1.pdf" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99