Skip to content

Commit

Permalink
BUG: tolerate truncated files and no warning when jumping startxref
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Sep 17, 2024
1 parent c00ec60 commit 7994a83
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
16 changes: 16 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,23 @@ def _find_eof_marker(self, stream: StreamType) -> None:
"""
HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6'
line = b""
first = True
while line[:5] != b"%%EOF":
if line != b"" and first:
if any(
line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
):
# The file appears to have been truncated in the middle of the
# %%EOF marker; that is enough confidence to carry on
logger_warning("EOF marker seems truncated", __name__)
break
first = False
if b"startxref" in line:
logger_warning(
"CAUTION : startxref found while searching for %%EOF\n"
"This could the file is truncated and mean some data will not be read",
__name__,
)
if stream.tell() < HEADER_SIZE:
if self.strict:
raise PdfReadError("EOF marker not found")
Expand Down
24 changes: 24 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,3 +1617,27 @@ def test_iss2817():
reader.pages[0]["/Annots"][0].get_object()["/Contents"]
== "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
)


@pytest.mark.enable_socket()
def test_truncated_files(caplog):
    """
    Exercise reading a PDF whose trailing ``%%EOF`` marker is damaged (cf. #2853).

    Four situations are checked in turn:

    * the untouched file and the file without its final newline must both
      load without logging anything;
    * cutting 2 to 5 bytes off the end must log "EOF marker seems truncated"
      and still give ``_startxref == 100993``;
    * cutting 6 bytes off the end must log the "startxref found while
      searching for %%EOF" caution and give a ``_startxref`` below 100993.
    """
    url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
    name = "iss2780.pdf"  # reused
    data = get_data_from_url(url, name=name)

    # Whole file first, then the same bytes minus the trailing "\n":
    # dropping that newline must be invisible, so nothing may be logged.
    for payload in (data, data[:-1]):
        reader = PdfReader(BytesIO(payload))
        assert caplog.text == ""

    # Marker cut short but still detectable: a warning is expected and the
    # startxref value must be unchanged.
    for cut in (2, 3, 4, 5):
        caplog.clear()
        reader = PdfReader(BytesIO(data[:-cut]))
        assert "EOF marker seems truncated" in caplog.text
        assert reader._startxref == 100993

    # EOF marker removed completely: the last section will not be read, so
    # a lower startxref is expected.
    caplog.clear()
    reader = PdfReader(BytesIO(data[:-6]))
    assert "CAUTION : startxref found while searching for %%EOF" in caplog.text
    assert reader._startxref < 100993

0 comments on commit 7994a83

Please sign in to comment.