Skip to content

Commit

Permalink
ENH: ASCIIHexDecode.decode now returns bytes instead of str (#1994)
Browse files Browse the repository at this point in the history
Please note that this is potentially backwards-incompatible!

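This also fixes a bug in image extraction: in pypdf/filters.py the final branch that builds the image with `Image.frombytes` is now an unconditional `else` instead of `elif lfilters is None`.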

Closes #1983
  • Loading branch information
pubpub-zz authored Jul 25, 2023
1 parent 890c93a commit 11ee648
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 18 deletions.
24 changes: 14 additions & 10 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,10 @@ class ASCIIHexDecode:

@staticmethod
def decode(
data: str,
data: Union[str, bytes],
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
**kwargs: Any,
) -> str:
) -> bytes:
"""
Decode an ASCII-Hex encoded data stream.
Expand All @@ -268,24 +268,26 @@ def decode(
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
retval = ""
hex_pair = ""
if isinstance(data, str):
data = data.encode()
retval = b""
hex_pair = b""
index = 0
while True:
if index >= len(data):
raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
char = data[index]
if char == ">":
char = data[index : index + 1]
if char == b">":
break
elif char.isspace():
index += 1
continue
hex_pair += char
if len(hex_pair) == 2:
retval += chr(int(hex_pair, base=16))
hex_pair = ""
retval += bytes((int(hex_pair, base=16),))
hex_pair = b""
index += 1
assert hex_pair == ""
assert hex_pair == b""
return retval


Expand Down Expand Up @@ -854,6 +856,8 @@ def _handle_jpx(

size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
data = x_object_obj.get_data() # type: ignore
if isinstance(data, str): # pragma: no cover
data = data.encode()
colors = x_object_obj.get("/Colors", 1)
color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
if (
Expand Down Expand Up @@ -914,7 +918,7 @@ def _handle_jpx(
"TIFF",
".tiff",
)
elif lfilters is None:
else:
img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png"

# CMYK image without decode requires reverting scale (cf p243,2§ last sentence)
Expand Down
29 changes: 21 additions & 8 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,26 @@ def test_flate_decode_decompress_with_array_params(params):
@pytest.mark.parametrize(
("data", "expected"),
[
(">", ""),
(">", b""),
(
"6162636465666768696a6b6c6d6e6f707172737475767778797a>",
string.ascii_lowercase,
string.ascii_lowercase.encode(),
),
(
"4142434445464748494a4b4c4d4e4f505152535455565758595a>",
string.ascii_uppercase,
string.ascii_uppercase.encode(),
),
(
"6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>",
string.ascii_letters,
string.ascii_letters.encode(),
),
("30313233343536373839>", string.digits),
("30313233343536373839>", string.digits.encode()),
(
"3 031323334353637 3839>",
string.digits,
string.digits.encode(),
), # Same as previous, but whitespaced
("30313233343536373839616263646566414243444546>", string.hexdigits),
("20090a0d0b0c>", string.whitespace),
("30313233343536373839616263646566414243444546>", string.hexdigits.encode()),
("20090a0d0b0c>", string.whitespace.encode()),
],
ids=[
"empty",
Expand Down Expand Up @@ -135,6 +135,19 @@ def test_ascii_hex_decode_missing_eod():
assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode"


@pytest.mark.enable_socket()
def test_decode_ahx():
    """
    Enumerate the images of every page of a PDF fetched by URL.

    The test has no explicit assertion: it passes as long as listing the
    image keys of each page raises no exception.

    See #1979
    Gray Image in CMYK : requiring reverse
    """
    pdf_url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
    cache_name = "NewJersey.pdf"
    pdf_bytes = get_pdf_from_url(pdf_url, name=cache_name)
    reader = PdfReader(BytesIO(pdf_bytes))
    for page in reader.pages:
        # The result is intentionally discarded; only the absence of an
        # exception matters here.
        _ = list(page.images.keys())


@pytest.mark.xfail()
def test_ascii85decode_with_overflow():
inputs = (
Expand Down

0 comments on commit 11ee648

Please sign in to comment.