ROB: Fix corruption in startxref or xref table (#788)

Use PdfReadWarning instead of UserWarning to be consistent. Closes #297.
py-pdf · Apr 27, 2022 · 904b0df · 904b0df
1 parent 35086b6
commit 904b0df
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 23 deletions.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -1837,6 +1837,35 @@ def read(self, stream):
             if line[:9] != b_("startxref"):
                 raise PdfReadError("startxref not found")
 
+        # check the startxref pointer and, if it is wrong, rebuild the xref table (only when not strict)
+        rebuildXrefTable = False
+        try:
+            stream.seek(startxref - 1,0) #-1 to check character before
+            line=stream.read(1)
+            if line not in b_("\r\n \t"):
+                raise PdfReadWarning("incorrect startxref pointer(1)",line)
+            line = stream.read(4)
+            if line != b_("xref"):
+                #not an xref so check if it is an XREF object
+                line = b_("")
+                while line in b_("0123456789 \t"):
+                    line = stream.read(1)
+                    if line == b_(""):
+                        raise PdfReadWarning("incorrect startxref pointer(2)")
+                line += stream.read(2)   #1 char already read, +2 to check "obj"
+                if line.lower() != b_("obj"):
+                    raise PdfReadWarning("incorrect startxref pointer(3)")
+                while stream.read(1) in b_(" \t\r\n"):
+                    pass;
+                line=stream.read(256) # check that it is xref obj
+                if b_("/xref") not in line.lower():
+                    raise PdfReadWarning("incorrect startxref pointer(4)")
+        except PdfReadWarning as e:
+            warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
+            if( not self.strict):
+                rebuildXrefTable = True
+            else:
+                raise
         # read all cross reference tables and their trailers
         self.xref = {}
         self.xref_objStm = {}
@@ -1922,6 +1951,30 @@ def read(self, stream):
                     startxref = newTrailer["/Prev"]
                 else:
                     break
+            elif rebuildXrefTable:
+                self.xref={}
+                stream.seek(0,0)
+                f_ = stream.read(-1)
+                import re
+                for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
+                    idnum = int(m.group(1))
+                    generation = int(m.group(2))
+                    if generation not in self.xref:
+                        self.xref[generation] = {}
+                    self.xref[generation][idnum] = m.start(1)
+                trailerPos = f_.rfind(b"trailer") - len(f_) + 7
+                stream.seek(trailerPos,2)
+                #code below duplicated
+                readNonWhitespace(stream)
+                stream.seek(-1, 1)
+                newTrailer = readObject(stream, self)
+                for key, value in list(newTrailer.items()):
+                    if key not in self.trailer:
+                        self.trailer[key] = value
+                #if "/Prev" in newTrailer:
+                #    startxref = newTrailer["/Prev"]
+                #else:
+                break
             elif x.isdigit():
                 # PDF 1.5+ Cross-Reference Stream
                 stream.seek(-1, 1)

diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -9,7 +9,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError
+from PyPDF2.errors import PdfReadError, PdfReadWarning
 from PyPDF2.filters import _xobj_to_image
 
 if version_info < (3, 0):
@@ -184,15 +184,19 @@ def test_get_images(src, nb_images):
 
 
 @pytest.mark.parametrize(
-    "strict,with_prev_0,should_fail",
+    "strict,with_prev_0,startx_correction,should_fail",
     [
-        (True, True, True),
-        (True, False, False),
-        (False, True, False),
-        (False, False, False),
+        (True, False, -1, False), # all nominal => no fail
+        (True, True, -1, True),   # Prev=0 => fail expected
+        (False, False, -1, False),
+        (False, True, -1, False), # Prev =0 => no strict so tolerant
+        (True, False, 0, True),   # error on startxref, in strict => fail expected
+        (True, True, 0, True),
+        (False, False, 0, False), # error on startxref, but no strict => xref rebuilt,no fail
+        (False, True, 0, False),
     ],
 )
-def test_get_images_raw(strict, with_prev_0, should_fail):
+def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
     pdf_data = (
         b"%%PDF-1.7\n"
         b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
@@ -220,29 +224,32 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref"),
+        # startx_correction should be -1 because the double % at the beginning induces an error in the startxref computation
+        pdf_data.find(b"xref") + startx_correction,
     )
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
-        with pytest.raises(PdfReadError) as exc:
+        with pytest.raises(Exception) as exc:
             PdfFileReader(pdf_stream, strict=strict)
-        assert (
-            exc.value.args[0]
-            == "/Prev=0 in the trailer (try opening with strict=False)"
-        )
+        if startx_correction != -1:
+            assert exc.type == PdfReadWarning
+        else:
+            assert (
+                exc.type == PdfReadError
+                and exc.value.args[0]
+                == "/Prev=0 in the trailer (try opening with strict=False)"
+            )
     else:
         PdfFileReader(pdf_stream, strict=strict)
 
 
-@pytest.mark.xfail(
-    reason=(
-        "It's still broken - and unclear what the issue is. "
-        "Help would be appreciated!"
-    )
-)
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
-    reader = PdfFileReader(path, "rb")
+    with pytest.raises(PdfReadWarning) as exc:
+        reader = PdfFileReader(path, strict=True)
+        reader.getPage(0)
+    assert "startxref" in exc.value.args[0]
+    reader = PdfFileReader(path, strict=False)
     reader.getPage(0)
 
 
@@ -384,7 +391,7 @@ def test_read_prev_0_trailer():
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref"),
+        pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc:
@@ -419,7 +426,7 @@ def test_read_missing_startxref():
         pdf_data.find(b"3 0 obj"),
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
-        # pdf_data.find(b"xref"),
+        # pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc:
@@ -455,7 +462,7 @@ def test_read_unknown_zero_pages():
         pdf_data.find(b"3 0 obj"),
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
-        pdf_data.find(b"xref"),
+        pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc: