From 19efa1d9a52c88328b49b6e1ec0dfe3e748e7176 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 20 Apr 2022 22:30:55 +0200
Subject: [PATCH 1/6] Fix #297 : fix corruption in startxref or xref table

    Proposal for #297

    Issue is due to corrupted startxref pointer and/or xref table
    Fixed proposed if incorrect pointer look for all objs to rebuild an xref table

    Tests updated
---
 PyPDF2/pdf.py        | 57 +++++++++++++++++++++++++++++++++++++++++++-
 Tests/test_reader.py | 19 ++++++---------
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index 47430221b..f5fcab3a1 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -1843,6 +1843,36 @@ def read(self, stream):
             if line[:9] != b_("startxref"):
                 raise PdfReadError("startxref not found")
 
+        #check and eventually correct the startxref only in not strict
+        rebuildXrefTable = False
+        try:
+            stream.seek(startxref - 1,0) #-1 to check character before
+            line=stream.read(1) 
+            if line not in b_("\r\n \t"):
+                raise UserWarning("incorrect startxref pointer(1)",line)
+            line = stream.read(4)
+            if line != b_("xref"):
+                #not an xref so check if it is an XREF object
+                line = b_("")
+                while line in b_("0123456789 \t"):
+                    line = stream.read(1)
+                    if line == b_(""):
+                        raise UserWarning("incorrect startxref pointer(2)")
+                line += stream.read(2)   #1 char already read, +2 to check "obj"
+                if line.lower() != b_("obj"):
+                    raise UserWarning("incorrect startxref pointer(3)")
+                while stream.read(1) in b_(" \t\r\n"):
+                    pass;  
+                line=stream.read(256) # check that it is xref obj
+                if b_("/xref") not in line.lower():
+                    raise UserWarning("incorrect startxref pointer(4)")
+        except UserWarning as e:
+            #import traceback;print(traceback.format_exc())
+            warnings.warn(str(e)+", need to rebuild xref table")
+            if( not self.strict):
+                rebuildXrefTable = True
+            else:
+                raise                     
         # read all cross reference tables and their trailers
         self.xref = {}
         self.xref_objStm = {}
@@ -1927,7 +1957,32 @@ def read(self, stream):
                 if "/Prev" in newTrailer:
                     startxref = newTrailer["/Prev"]
                 else:
-                    break
+                    break               
+            elif rebuildXrefTable:
+                self.xref={}
+                p_ = stream.tell()
+                stream.seek(0,0)
+                f_ = stream.read(-1)
+                import re
+                for m in re.finditer(b_("[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
+                    idnum = int(m.group(1))
+                    generation = int(m.group(2))
+                    if generation not in self.xref:
+                        self.xref[generation] = {}
+                    self.xref[generation][idnum] = m.start(1)
+                trailerPos = f_.rfind(b"trailer") - len(f_) + 7                
+                stream.seek(trailerPos,2)
+                #code below duplicated
+                readNonWhitespace(stream)
+                stream.seek(-1, 1)
+                newTrailer = readObject(stream, self)
+                for key, value in list(newTrailer.items()):
+                    if key not in self.trailer:
+                        self.trailer[key] = value
+                if "/Prev" in newTrailer:
+                    startxref = newTrailer["/Prev"]
+                else:
+                    break                
             elif x.isdigit():
                 # PDF 1.5+ Cross-Reference Stream
                 stream.seek(-1, 1)
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index 0d1d6e906..720a5483f 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -200,7 +200,7 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref"),
+        pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning
     )
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
@@ -214,16 +214,11 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
         PdfFileReader(pdf_stream, strict=strict)
 
 
-@pytest.mark.xfail(
-    reason=(
-        "It's still broken - and unclear what the issue is. "
-        "Help would be appreciated!"
-    )
-)
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
-    reader = PdfFileReader(path, "rb")
-    reader.getPage(0)
+    with pytest.raises(UserWarning):
+        reader = PdfFileReader(path, "rb")
+        reader.getPage(0)
 
 
 def test_get_page_of_encrypted_file():
@@ -353,7 +348,7 @@ def test_read_prev_0_trailer():
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref"),
+        pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc:
@@ -388,7 +383,7 @@ def test_read_missing_startxref():
         pdf_data.find(b"3 0 obj"),
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
-        # pdf_data.find(b"xref"),
+        # pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc:
@@ -424,7 +419,7 @@ def test_read_unknown_zero_pages():
         pdf_data.find(b"3 0 obj"),
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
-        pdf_data.find(b"xref"),
+        pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
     with pytest.raises(PdfReadError) as exc:

From 706d4c56339b3f785219ccfa67efb47083a47322 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 20 Apr 2022 23:35:15 +0200
Subject: [PATCH 2/6] cleanup flake8 report

---
 PyPDF2/pdf.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index f5fcab3a1..0e9e406e6 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -1847,7 +1847,7 @@ def read(self, stream):
         rebuildXrefTable = False
         try:
             stream.seek(startxref - 1,0) #-1 to check character before
-            line=stream.read(1) 
+            line=stream.read(1)
             if line not in b_("\r\n \t"):
                 raise UserWarning("incorrect startxref pointer(1)",line)
             line = stream.read(4)
@@ -1862,7 +1862,7 @@ def read(self, stream):
                 if line.lower() != b_("obj"):
                     raise UserWarning("incorrect startxref pointer(3)")
                 while stream.read(1) in b_(" \t\r\n"):
-                    pass;  
+                    pass;
                 line=stream.read(256) # check that it is xref obj
                 if b_("/xref") not in line.lower():
                     raise UserWarning("incorrect startxref pointer(4)")
@@ -1872,7 +1872,7 @@ def read(self, stream):
             if( not self.strict):
                 rebuildXrefTable = True
             else:
-                raise                     
+                raise
         # read all cross reference tables and their trailers
         self.xref = {}
         self.xref_objStm = {}
@@ -1957,20 +1957,19 @@ def read(self, stream):
                 if "/Prev" in newTrailer:
                     startxref = newTrailer["/Prev"]
                 else:
-                    break               
+                    break
             elif rebuildXrefTable:
                 self.xref={}
-                p_ = stream.tell()
                 stream.seek(0,0)
                 f_ = stream.read(-1)
                 import re
-                for m in re.finditer(b_("[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
+                for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
                     idnum = int(m.group(1))
                     generation = int(m.group(2))
                     if generation not in self.xref:
                         self.xref[generation] = {}
                     self.xref[generation][idnum] = m.start(1)
-                trailerPos = f_.rfind(b"trailer") - len(f_) + 7                
+                trailerPos = f_.rfind(b"trailer") - len(f_) + 7
                 stream.seek(trailerPos,2)
                 #code below duplicated
                 readNonWhitespace(stream)
@@ -1982,7 +1981,7 @@ def read(self, stream):
                 if "/Prev" in newTrailer:
                     startxref = newTrailer["/Prev"]
                 else:
-                    break                
+                    break
             elif x.isdigit():
                 # PDF 1.5+ Cross-Reference Stream
                 stream.seek(-1, 1)

From e61e4696dbb71dc41d44ed1f94866e96b2ab502e Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 23 Apr 2022 13:23:46 +0200
Subject: [PATCH 3/6] review Test

cf comments in https://github.com/py-pdf/PyPDF2/pull/788#discussion_r856328104

Test in strict=False and True;
use PdfReadWarning instead of UserWarniing to be consistant with other raising
---
 PyPDF2/pdf.py        | 13 ++++++-------
 Tests/test_reader.py | 10 +++++++---
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index 0e9e406e6..149e47ad3 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -1849,7 +1849,7 @@ def read(self, stream):
             stream.seek(startxref - 1,0) #-1 to check character before
             line=stream.read(1)
             if line not in b_("\r\n \t"):
-                raise UserWarning("incorrect startxref pointer(1)",line)
+                raise PdfReadWarning("incorrect startxref pointer(1)",line)
             line = stream.read(4)
             if line != b_("xref"):
                 #not an xref so check if it is an XREF object
@@ -1857,18 +1857,17 @@ def read(self, stream):
                 while line in b_("0123456789 \t"):
                     line = stream.read(1)
                     if line == b_(""):
-                        raise UserWarning("incorrect startxref pointer(2)")
+                        raise PdfReadWarning("incorrect startxref pointer(2)")
                 line += stream.read(2)   #1 char already read, +2 to check "obj"
                 if line.lower() != b_("obj"):
-                    raise UserWarning("incorrect startxref pointer(3)")
+                    raise PdfReadWarning("incorrect startxref pointer(3)")
                 while stream.read(1) in b_(" \t\r\n"):
                     pass;
                 line=stream.read(256) # check that it is xref obj
                 if b_("/xref") not in line.lower():
-                    raise UserWarning("incorrect startxref pointer(4)")
-        except UserWarning as e:
-            #import traceback;print(traceback.format_exc())
-            warnings.warn(str(e)+", need to rebuild xref table")
+                    raise PdfReadWarning("incorrect startxref pointer(4)")
+        except PdfReadWarning as e:
+            warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
             if( not self.strict):
                 rebuildXrefTable = True
             else:
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index 720a5483f..852f4a567 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -7,7 +7,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError
+from PyPDF2.errors import PdfReadError,PdfReadWarning
 from PyPDF2.filters import _xobj_to_image
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
@@ -216,9 +216,13 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
 
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
-    with pytest.raises(UserWarning):
-        reader = PdfFileReader(path, "rb")
+    with pytest.raises(PdfReadWarning) as exc:
+        print(exc)
+        reader = PdfFileReader(path,strict=True)
         reader.getPage(0)
+    assert ( exc.value.args[0].find("startxref") > 0)
+    reader = PdfFileReader(path,strict=False)
+    reader.getPage(0)
 
 
 def test_get_page_of_encrypted_file():

From cd2df385d0fd91c03513f190cb0f79db4f57958d Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 24 Apr 2022 10:35:36 +0200
Subject: [PATCH 4/6] reformatting with black

applied only on test_reader as recommanded
---
 Tests/test_reader.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index 852f4a567..7052013bd 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -7,7 +7,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError,PdfReadWarning
+from PyPDF2.errors import PdfReadError, PdfReadWarning
 from PyPDF2.filters import _xobj_to_image
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
@@ -200,7 +200,7 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning
+        pdf_data.find(b"xref") - 1,  # -1 due to double % at the beginning
     )
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
@@ -218,10 +218,10 @@ def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
     with pytest.raises(PdfReadWarning) as exc:
         print(exc)
-        reader = PdfFileReader(path,strict=True)
+        reader = PdfFileReader(path, strict=True)
         reader.getPage(0)
-    assert ( exc.value.args[0].find("startxref") > 0)
-    reader = PdfFileReader(path,strict=False)
+    assert exc.value.args[0].find("startxref") > 0
+    reader = PdfFileReader(path, strict=False)
     reader.getPage(0)
 
 

From c7ecd8ce7e4a506e07ef8c2f5d59cab42e0c99a1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 24 Apr 2022 16:27:00 +0200
Subject: [PATCH 5/6] extend test

Test completed to cover old cases
loop detected and fix if rebuilding xref table
---
 PyPDF2/pdf.py        |  8 ++++----
 Tests/test_reader.py | 36 ++++++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index 149e47ad3..33fe7133d 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -1977,10 +1977,10 @@ def read(self, stream):
                 for key, value in list(newTrailer.items()):
                     if key not in self.trailer:
                         self.trailer[key] = value
-                if "/Prev" in newTrailer:
-                    startxref = newTrailer["/Prev"]
-                else:
-                    break
+                #if "/Prev" in newTrailer:
+                #    startxref = newTrailer["/Prev"]
+                #else:
+                break
             elif x.isdigit():
                 # PDF 1.5+ Cross-Reference Stream
                 stream.seek(-1, 1)
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index 7052013bd..927182805 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -164,15 +164,19 @@ def test_get_images(src, nb_images):
 
 
 @pytest.mark.parametrize(
-    "strict,with_prev_0,should_fail",
+    "strict,with_prev_0,startx_correction,should_fail",
     [
-        (True, True, True),
-        (True, False, False),
-        (False, True, False),
-        (False, False, False),
+        (True, False, -1, False), # all nominal => no fail
+        (True, True, -1, True),   # Prev=0 => fail expected
+        (False, False, -1, False),
+        (False, True, -1, False), # Prev =0 => no strict so tolerant
+        (True, False, 0, True),   # error on startxref, in strict => fail expected
+        (True, True, 0, True),
+        (False, False, 0, False), # error on startxref, but no strict => xref rebuilt,no fail
+        (False, True, 0, False),
     ],
 )
-def test_get_images_raw(strict, with_prev_0, should_fail):
+def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
     pdf_data = (
         b"%%PDF-1.7\n"
         b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
@@ -200,16 +204,24 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
         pdf_data.find(b"4 0 obj"),
         pdf_data.find(b"5 0 obj"),
         b"/Prev 0 " if with_prev_0 else b"",
-        pdf_data.find(b"xref") - 1,  # -1 due to double % at the beginning
+        # startx_correction should be -1 due to double % at the beginning indiducing an error on startxref computation
+        pdf_data.find(b"xref") + startx_correction,
     )
+    print((strict, with_prev_0, should_fail))
+    print(pdf_data)
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
-        with pytest.raises(PdfReadError) as exc:
+        with pytest.raises(Exception) as exc:
             PdfFileReader(pdf_stream, strict=strict)
-        assert (
-            exc.value.args[0]
-            == "/Prev=0 in the trailer (try opening with strict=False)"
-        )
+        print(exc.type)
+        if startx_correction != -1:
+            assert exc.type == PdfReadWarning
+        else:
+            assert (
+                exc.type == PdfReadError
+                and exc.value.args[0]
+                == "/Prev=0 in the trailer (try opening with strict=False)"
+            )
     else:
         PdfFileReader(pdf_stream, strict=strict)
 

From 28522480349212d48c6d6ddade56280b68bfa53b Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 24 Apr 2022 17:04:25 +0200
Subject: [PATCH 6/6] cleanup

https://github.com/py-pdf/PyPDF2/pull/788#discussion_r857136971
---
 Tests/test_reader.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/Tests/test_reader.py b/Tests/test_reader.py
index 927182805..ff7ae6aa0 100644
--- a/Tests/test_reader.py
+++ b/Tests/test_reader.py
@@ -207,13 +207,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
         # startx_correction should be -1 due to double % at the beginning indiducing an error on startxref computation
         pdf_data.find(b"xref") + startx_correction,
     )
-    print((strict, with_prev_0, should_fail))
-    print(pdf_data)
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
         with pytest.raises(Exception) as exc:
             PdfFileReader(pdf_stream, strict=strict)
-        print(exc.type)
         if startx_correction != -1:
             assert exc.type == PdfReadWarning
         else:
@@ -229,10 +226,9 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
     with pytest.raises(PdfReadWarning) as exc:
-        print(exc)
         reader = PdfFileReader(path, strict=True)
         reader.getPage(0)
-    assert exc.value.args[0].find("startxref") > 0
+    assert "startxref" in exc.value.args[0]
     reader = PdfFileReader(path, strict=False)
     reader.getPage(0)