Skip to content

Commit

Permalink
ENH: Accept inline images with space before EI (#1552)
Browse files Browse the repository at this point in the history
Closes #1541
  • Loading branch information
pubpub-zz authored Jan 16, 2023
1 parent f0c0a1d commit df90053
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 17 deletions.
50 changes: 34 additions & 16 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
# We have reached the end of the stream, but haven't found the EI operator.
if not buf:
raise PdfReadError("Unexpected end of stream")
loc = buf.find(b"E")
loc = buf.find(
b"E"
) # we can not look straight for "EI" because it may not have been loaded in the buffer

if loc == -1:
data.write(buf)
Expand All @@ -1026,28 +1028,44 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:

# Seek back in the stream to read the E next.
stream.seek(loc - len(buf), 1)
tok = stream.read(1)
tok = stream.read(1) # E of "EI"
# Check for End Image
tok2 = stream.read(1)
if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES:
# Data can contain [\s]EI, so check for the separator \s; 4 chars suffisent Q operator not required.
tok3 = stream.read(1)
info = tok + tok2
# We need to find at least one whitespace after.
has_q_whitespace = False
tok2 = stream.read(1) # I of "EI"
if tok2 != b"I":
stream.seek(-1, 1)
data.write(tok)
continue
# for further debug : print("!!!!",buf[loc-1:loc+10])
info = tok + tok2
tok3 = stream.read(
1
) # possible space after "EI" may not been loaded in buf
if tok3 not in WHITESPACES:
stream.seek(-2, 1) # to step back on I
data.write(tok)
elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES:
# Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required.
while tok3 in WHITESPACES:
# needed ???? : info += tok3
tok3 = stream.read(1)
stream.seek(-1, 1)
# we do not insert EI
break
else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES:
# Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars.
while tok3 in WHITESPACES:
has_q_whitespace = True
info += tok3
tok3 = stream.read(1)
if has_q_whitespace:
stream.seek(-1, 1)
stream.seek(-1, 1)
if tok3 == b"Q":
break
elif tok3 == b"E":
ope = stream.read(3)
stream.seek(-3, 1)
if ope == b"EMC":
break
else:
stream.seek(-1, 1)
data.write(info)
else:
stream.seek(-1, 1)
data.write(tok)
return {"settings": settings, "data": data.getvalue()}

@property
Expand Down
36 changes: 35 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

from pypdf import PdfMerger, PdfReader, PdfWriter
from pypdf.constants import PageAttributes as PG
from pypdf.errors import PdfReadWarning
from pypdf.errors import PdfReadError, PdfReadWarning
from pypdf.generic import ContentStream, read_object

from . import get_pdf_from_url, normalize_warnings

Expand Down Expand Up @@ -880,3 +881,36 @@ def test_tounicode_is_identity():
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data, strict=False)
reader.pages[0].extract_text()


@pytest.mark.external
def test_extra_test_iss1541():
    """
    Exercise inline-image parsing with the sample PDF attached to issue #1541.

    Steps:

    1. Text extraction of the first page must succeed.
    2. Re-serializing the page content with an extra ``EMC`` operator inserted
       before the last operation must still be parseable.
    3. Doing the same with the bogus ``E!C`` operator must fail with
       "Unexpected end of stream".
    4. Replacing ``EI \\n`` by ``E! \\n`` in the raw file must make text
       extraction fail with the same error.
    """
    pdf_url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf"
    pdf_name = "tst_iss1541.pdf"
    data = BytesIO(get_pdf_from_url(pdf_url, name=pdf_name))
    reader = PdfReader(data, strict=False)
    reader.pages[0].extract_text()

    def serialized_with(operator):
        """Return a rewound stream of the page content with *operator* inserted before the last operation."""
        content = ContentStream(reader.pages[0]["/Contents"], None, None)
        content.operations.insert(-1, ([], operator))
        out = BytesIO()
        content.write_to_stream(out, None)
        out.seek(0)
        return out

    # A trailing EMC operator is accepted as a valid end of the inline image.
    rewound = serialized_with(b"EMC")
    ContentStream(read_object(rewound, None, None), None, None).operations

    # "E!C" is not accepted, so the parser runs off the end of the stream.
    # Only the re-parse sits inside the ``raises`` block, on purpose.
    rewound = serialized_with(b"E!C")
    with pytest.raises(PdfReadError) as exc:
        ContentStream(read_object(rewound, None, None), None, None).operations
    assert exc.value.args[0] == "Unexpected end of stream"

    # Corrupt the EI markers in the raw file bytes and parse from scratch.
    corrupted = data.getvalue().replace(b"EI \n", b"E! \n")
    broken_reader = PdfReader(BytesIO(corrupted), strict=False)
    with pytest.raises(PdfReadError) as exc:
        broken_reader.pages[0].extract_text()
    assert exc.value.args[0] == "Unexpected end of stream"

0 comments on commit df90053

Please sign in to comment.