From 11ee6480a3f795d770da89944f32a977e3c110e2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 25 Jul 2023 22:09:06 +0200 Subject: [PATCH] ENH: ASCIIHexDecode.decode now returns bytes instead of str (#1994) Please note that this is potentially backwards-incompatible! This also fixes a bug. Closes #1983 --- pypdf/filters.py | 24 ++++++++++++++---------- tests/test_filters.py | 29 +++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index a3329554a..e6f311529 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -245,10 +245,10 @@ class ASCIIHexDecode: @staticmethod def decode( - data: str, + data: Union[str, bytes], decode_parms: Union[None, ArrayObject, DictionaryObject] = None, **kwargs: Any, - ) -> str: + ) -> bytes: """ Decode an ASCII-Hex encoded data stream. @@ -268,24 +268,26 @@ def decode( if "decodeParms" in kwargs: # deprecated deprecate_with_replacement("decodeParms", "parameters", "4.0.0") decode_parms = kwargs["decodeParms"] # noqa: F841 - retval = "" - hex_pair = "" + if isinstance(data, str): + data = data.encode() + retval = b"" + hex_pair = b"" index = 0 while True: if index >= len(data): raise PdfStreamError("Unexpected EOD in ASCIIHexDecode") - char = data[index] - if char == ">": + char = data[index : index + 1] + if char == b">": break elif char.isspace(): index += 1 continue hex_pair += char if len(hex_pair) == 2: - retval += chr(int(hex_pair, base=16)) - hex_pair = "" + retval += bytes((int(hex_pair, base=16),)) + hex_pair = b"" index += 1 - assert hex_pair == "" + assert hex_pair == b"" return retval @@ -854,6 +856,8 @@ def _handle_jpx( size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT]) data = x_object_obj.get_data() # type: ignore + if isinstance(data, str): # pragma: no cover + data = data.encode() colors = x_object_obj.get("/Colors", 1) color_space: Any = x_object_obj.get("/ColorSpace", 
NullObject()).get_object() if ( @@ -914,7 +918,7 @@ def _handle_jpx( "TIFF", ".tiff", ) - elif lfilters is None: + else: img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png" # CMYK image without decode requires reverting scale (cf p243,2§ last sentence) diff --git a/tests/test_filters.py b/tests/test_filters.py index 2f4df02ef..76f49430e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -84,26 +84,26 @@ def test_flate_decode_decompress_with_array_params(params): @pytest.mark.parametrize( ("data", "expected"), [ - (">", ""), + (">", b""), ( "6162636465666768696a6b6c6d6e6f707172737475767778797a>", - string.ascii_lowercase, + string.ascii_lowercase.encode(), ), ( "4142434445464748494a4b4c4d4e4f505152535455565758595a>", - string.ascii_uppercase, + string.ascii_uppercase.encode(), ), ( "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>", - string.ascii_letters, + string.ascii_letters.encode(), ), - ("30313233343536373839>", string.digits), + ("30313233343536373839>", string.digits.encode()), ( "3 031323334353637 3839>", - string.digits, + string.digits.encode(), ), # Same as previous, but whitespaced - ("30313233343536373839616263646566414243444546>", string.hexdigits), - ("20090a0d0b0c>", string.whitespace), + ("30313233343536373839616263646566414243444546>", string.hexdigits.encode()), + ("20090a0d0b0c>", string.whitespace.encode()), ], ids=[ "empty", @@ -135,6 +135,19 @@ def test_ascii_hex_decode_missing_eod(): assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode" +@pytest.mark.enable_socket() +def test_decode_ahx(): + """ + See #1979 + Gray Image in CMYK : requiring reverse + """ + url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" + name = "NewJersey.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for p in reader.pages: + _ = list(p.images.keys()) 
+ + @pytest.mark.xfail() def test_ascii85decode_with_overflow(): inputs = (