From df90053c3e673dfafafba8557e1d379883b62dc0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 16 Jan 2023 20:55:43 +0100 Subject: [PATCH] ENH: Accept inline images with space before EI (#1552) Closes #1541 --- pypdf/generic/_data_structures.py | 50 +++++++++++++++++++++---------- tests/test_workflows.py | 36 +++++++++++++++++++++- 2 files changed, 69 insertions(+), 17 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2e472f51c..57cc1beaa 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1016,7 +1016,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # We have reached the end of the stream, but haven't found the EI operator. if not buf: raise PdfReadError("Unexpected end of stream") - loc = buf.find(b"E") + loc = buf.find( + b"E" + ) # we can not look straight for "EI" because it may not have been loaded in the buffer if loc == -1: data.write(buf) @@ -1026,28 +1028,44 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # Seek back in the stream to read the E next. stream.seek(loc - len(buf), 1) - tok = stream.read(1) + tok = stream.read(1) # E of "EI" # Check for End Image - tok2 = stream.read(1) - if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES: - # Data can contain [\s]EI, so check for the separator \s; 4 chars suffisent Q operator not required. - tok3 = stream.read(1) - info = tok + tok2 - # We need to find at least one whitespace after. 
- has_q_whitespace = False + tok2 = stream.read(1) # I of "EI" + if tok2 != b"I": + stream.seek(-1, 1) + data.write(tok) + continue + # for further debug : print("!!!!",buf[loc-1:loc+10]) + info = tok + tok2 + tok3 = stream.read( + 1 + ) # possible space after "EI" may not have been loaded in buf + if tok3 not in WHITESPACES: + stream.seek(-2, 1) # to step back on I + data.write(tok) + elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: + # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. + while tok3 in WHITESPACES: + # needed ???? : info += tok3 + tok3 = stream.read(1) + stream.seek(-1, 1) + # we do not insert EI + break + else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: + # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. while tok3 in WHITESPACES: - has_q_whitespace = True info += tok3 tok3 = stream.read(1) - if has_q_whitespace: - stream.seek(-1, 1) + stream.seek(-1, 1) + if tok3 == b"Q": break + elif tok3 == b"E": + ope = stream.read(3) + stream.seek(-3, 1) + if ope == b"EMC": + break else: - stream.seek(-1, 1) data.write(info) - else: - stream.seek(-1, 1) - data.write(tok) return {"settings": settings, "data": data.getvalue()} @property diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 63f822830..8209f6db1 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -15,7 +15,8 @@ from pypdf import PdfMerger, PdfReader, PdfWriter from pypdf.constants import PageAttributes as PG -from pypdf.errors import PdfReadWarning +from pypdf.errors import PdfReadError, PdfReadWarning +from pypdf.generic import ContentStream, read_object from . 
import get_pdf_from_url, normalize_warnings @@ -880,3 +881,36 @@ def test_tounicode_is_identity(): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data, strict=False) reader.pages[0].extract_text() + + +@pytest.mark.external +def test_extra_test_iss1541(): + url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf" + name = "tst_iss1541.pdf" + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data, strict=False) + reader.pages[0].extract_text() + + cs = ContentStream(reader.pages[0]["/Contents"], None, None) + cs.operations.insert(-1, ([], b"EMC")) + bu = BytesIO() + cs.write_to_stream(bu, None) + bu.seek(0) + ContentStream(read_object(bu, None, None), None, None).operations + + cs = ContentStream(reader.pages[0]["/Contents"], None, None) + cs.operations.insert(-1, ([], b"E!C")) + bu = BytesIO() + cs.write_to_stream(bu, None) + bu.seek(0) + with pytest.raises(PdfReadError) as exc: + ContentStream(read_object(bu, None, None), None, None).operations + assert exc.value.args[0] == "Unexpected end of stream" + + buf2 = BytesIO(data.getbuffer()) + reader = PdfReader( + BytesIO(bytes(buf2.getbuffer()).replace(b"EI \n", b"E! \n")), strict=False + ) + with pytest.raises(PdfReadError) as exc: + reader.pages[0].extract_text() + assert exc.value.args[0] == "Unexpected end of stream"