py-pdf · stefan6419846 · Sep 18, 2024 · Sep 17, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -649,7 +649,23 @@ def _find_eof_marker(self, stream: StreamType) -> None:
         """
         HEADER_SIZE = 8  # to parse whole file, Header is e.g. '%PDF-1.6'
         line = b""
+        first = True
         while line[:5] != b"%%EOF":
+            if line != b"" and first:
+                if any(
+                    line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
+                ):
+                    # The last line ends with a partial %%EOF marker: assume the
+                    # file was truncated inside the marker and carry on
+                    logger_warning("EOF marker seems truncated", __name__)
+                    break
+                first = False
+            if b"startxref" in line:
+                logger_warning(
+                    "CAUTION : startxref found while searching for %%EOF\n"
+                    "This could mean the file is truncated and some data will not be read",
+                    __name__,
+                )
             if stream.tell() < HEADER_SIZE:
                 if self.strict:
                     raise PdfReadError("EOF marker not found")

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -1617,3 +1617,27 @@ def test_iss2817():
         reader.pages[0]["/Annots"][0].get_object()["/Contents"]
         == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
     )
+
+
+@pytest.mark.enable_socket()
+def test_truncated_files(caplog):
+    """Cf #2853"""
+    url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
+    name = "iss2780.pdf"  # reused
+    b = get_data_from_url(url, name=name)
+    reader = PdfReader(BytesIO(b))
+    assert caplog.text == ""
+    # remove the trailing \n at end of file: no warning expected
+    reader = PdfReader(BytesIO(b[:-1]))
+    assert caplog.text == ""
+    # truncate inside the EOF marker: truncation is still detectable
+    for i in range(-2, -6, -1):
+        caplog.clear()
+        reader = PdfReader(BytesIO(b[:i]))
+        assert "EOF marker seems truncated" in caplog.text
+        assert reader._startxref == 100993
+    # remove the EOF marker completely: the last section will not be read
+    caplog.clear()
+    reader = PdfReader(BytesIO(b[:-6]))
+    assert "CAUTION : startxref found while searching for %%EOF" in caplog.text
+    assert reader._startxref < 100993