MAINT: Refactoring after #788 (#830)

This refactoring aims at making maintenance easier: 1. Too long functions make it hard to grasp the overall behavior. Hence the _get_xref_issues function was split out 2. `_get_xref_issues` is made a static method of the PdfFileReader to show that it belongs to the reader, but doesn't require any of its attributes 3. `_get_xref_issues` makes use of an integer return value instead of raising + catching exceptions. 4. `_rebuild_xref_table` was moved to a method for the same reason.
py-pdf · Apr 28, 2022 · ebe03d3 · ebe03d3
1 parent 1df2e4b
commit ebe03d3
Show file tree

Hide file tree

Showing 2 changed files with 176 additions and 161 deletions.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -1837,35 +1837,15 @@ def read(self, stream):
             if line[:9] != b_("startxref"):
                 raise PdfReadError("startxref not found")
 
-        #check and eventually correct the startxref only in not strict
-        rebuildXrefTable = False
-        try:
-            stream.seek(startxref - 1,0) #-1 to check character before
-            line=stream.read(1)
-            if line not in b_("\r\n \t"):
-                raise PdfReadWarning("incorrect startxref pointer(1)",line)
-            line = stream.read(4)
-            if line != b_("xref"):
-                #not an xref so check if it is an XREF object
-                line = b_("")
-                while line in b_("0123456789 \t"):
-                    line = stream.read(1)
-                    if line == b_(""):
-                        raise PdfReadWarning("incorrect startxref pointer(2)")
-                line += stream.read(2)   #1 char already read, +2 to check "obj"
-                if line.lower() != b_("obj"):
-                    raise PdfReadWarning("incorrect startxref pointer(3)")
-                while stream.read(1) in b_(" \t\r\n"):
-                    pass;
-                line=stream.read(256) # check that it is xref obj
-                if b_("/xref") not in line.lower():
-                    raise PdfReadWarning("incorrect startxref pointer(4)")
-        except PdfReadWarning as e:
-            warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
-            if( not self.strict):
-                rebuildXrefTable = True
-            else:
-                raise
+        # check and eventually correct the startxref only in not strict
+        xref_issue_nr = self._get_xref_issues(stream, startxref)
+        if self.strict and xref_issue_nr:
+            raise PdfReadError("Broken xref table")
+        else:
+            warnings.warn(
+                "incorrect startxref pointer({})".format(xref_issue_nr), PdfReadWarning
+            )
+
         # read all cross reference tables and their trailers
         self.xref = {}
         self.xref_objStm = {}
@@ -1875,72 +1855,7 @@ def read(self, stream):
             stream.seek(startxref, 0)
             x = stream.read(1)
             if x == b_("x"):
-                # standard cross-reference table
-                ref = stream.read(4)
-                if ref[:3] != b_("ref"):
-                    raise PdfReadError("xref table read error")
-                readNonWhitespace(stream)
-                stream.seek(-1, 1)
-                firsttime = True; # check if the first time looking at the xref table
-                while True:
-                    num = readObject(stream, self)
-                    if firsttime and num != 0:
-                         self.xrefIndex = num
-                         if self.strict:
-                            warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", PdfReadWarning)
-                            # if table not zero indexed, could be due to error from when PDF was created
-                            # which will lead to mismatched indices later on, only warned and corrected if self.strict=True
-                    firsttime = False
-                    readNonWhitespace(stream)
-                    stream.seek(-1, 1)
-                    size = readObject(stream, self)
-                    readNonWhitespace(stream)
-                    stream.seek(-1, 1)
-                    cnt = 0
-                    while cnt < size:
-                        line = stream.read(20)
-
-                        # It's very clear in section 3.4.3 of the PDF spec
-                        # that all cross-reference table lines are a fixed
-                        # 20 bytes (as of PDF 1.7). However, some files have
-                        # 21-byte entries (or more) due to the use of \r\n
-                        # (CRLF) EOL's. Detect that case, and adjust the line
-                        # until it does not begin with a \r (CR) or \n (LF).
-                        while line[0] in b_("\x0D\x0A"):
-                            stream.seek(-20 + 1, 1)
-                            line = stream.read(20)
-
-                        # On the other hand, some malformed PDF files
-                        # use a single character EOL without a preceeding
-                        # space.  Detect that case, and seek the stream
-                        # back one character.  (0-9 means we've bled into
-                        # the next xref entry, t means we've bled into the
-                        # text "trailer"):
-                        if line[-1] in b_("0123456789t"):
-                            stream.seek(-1, 1)
-
-                        offset, generation = line[:16].split(b_(" "))
-                        offset, generation = int(offset), int(generation)
-                        if generation not in self.xref:
-                            self.xref[generation] = {}
-                        if num in self.xref[generation]:
-                            # It really seems like we should allow the last
-                            # xref table in the file to override previous
-                            # ones. Since we read the file backwards, assume
-                            # any existing key is already set correctly.
-                            pass
-                        else:
-                            self.xref[generation][num] = offset
-                        cnt += 1
-                        num += 1
-                    readNonWhitespace(stream)
-                    stream.seek(-1, 1)
-                    trailertag = stream.read(7)
-                    if trailertag != b_("trailer"):
-                        # more xrefs!
-                        stream.seek(-7, 1)
-                    else:
-                        break
+                self._read_standard_xref_table(stream)
                 readNonWhitespace(stream)
                 stream.seek(-1, 1)
                 newTrailer = readObject(stream, self)
@@ -1951,65 +1866,11 @@ def read(self, stream):
                     startxref = newTrailer["/Prev"]
                 else:
                     break
-            elif rebuildXrefTable:
-                self.xref={}
-                stream.seek(0,0)
-                f_ = stream.read(-1)
-                import re
-                for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
-                    idnum = int(m.group(1))
-                    generation = int(m.group(2))
-                    if generation not in self.xref:
-                        self.xref[generation] = {}
-                    self.xref[generation][idnum] = m.start(1)
-                trailerPos = f_.rfind(b"trailer") - len(f_) + 7
-                stream.seek(trailerPos,2)
-                #code below duplicated
-                readNonWhitespace(stream)
-                stream.seek(-1, 1)
-                newTrailer = readObject(stream, self)
-                for key, value in list(newTrailer.items()):
-                    if key not in self.trailer:
-                        self.trailer[key] = value
-                #if "/Prev" in newTrailer:
-                #    startxref = newTrailer["/Prev"]
-                #else:
+            elif xref_issue_nr:
+                self._rebuild_xref_table(stream)
                 break
             elif x.isdigit():
-                # PDF 1.5+ Cross-Reference Stream
-                stream.seek(-1, 1)
-                idnum, generation = self.readObjectHeader(stream)
-                xrefstream = readObject(stream, self)
-                assert xrefstream["/Type"] == "/XRef"
-                self.cacheIndirectObject(generation, idnum, xrefstream)
-                streamData = BytesIO(b_(xrefstream.getData()))
-                # Index pairs specify the subsections in the dictionary. If
-                # none create one subsection that spans everything.
-                idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
-                entrySizes = xrefstream.get("/W")
-                assert len(entrySizes) >= 3
-                if self.strict and len(entrySizes) > 3:
-                    raise PdfReadError("Too many entry sizes: %s" %entrySizes)
-
-                def getEntry(i):
-                    # Reads the correct number of bytes for each entry. See the
-                    # discussion of the W parameter in PDF spec table 17.
-                    if entrySizes[i] > 0:
-                        d = streamData.read(entrySizes[i])
-                        return convertToInt(d, entrySizes[i])
-
-                    # PDF Spec Table 17: A value of zero for an element in the
-                    # W array indicates...the default value shall be used
-                    if i == 0:  return 1 # First value defaults to 1
-                    else:       return 0
-
-                def used_before(num, generation):
-                    # We move backwards through the xrefs, don't replace any.
-                    return num in self.xref.get(generation, []) or \
-                            num in self.xref_objStm
-
-                # Iterate through each subsection
-                self._read_xref_subsections(idx_pairs, getEntry, used_before)
+                xrefstream = self._read_pdf15_xref_stream(stream)
 
                 trailerKeys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID
                 for key in trailerKeys:
@@ -2071,6 +1932,162 @@ def used_before(num, generation):
                     # if not, then either it's just plain wrong, or the non-zero-index is actually correct
             stream.seek(loc, 0) # return to where it was
 
+    def _read_standard_xref_table(self, stream):
+        # standard cross-reference table
+        ref = stream.read(4)
+        if ref[:3] != b_("ref"):
+            raise PdfReadError("xref table read error")
+        readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        firsttime = True  # check if the first time looking at the xref table
+        while True:
+            num = readObject(stream, self)
+            if firsttime and num != 0:
+                self.xrefIndex = num
+                if self.strict:
+                    warnings.warn(
+                        "Xref table not zero-indexed. ID numbers for objects will be corrected.",
+                        PdfReadWarning,
+                    )
+                    # if table not zero indexed, could be due to error from when PDF was created
+                    # which will lead to mismatched indices later on, only warned and corrected if self.strict=True
+            firsttime = False
+            readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            size = readObject(stream, self)
+            readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            cnt = 0
+            while cnt < size:
+                line = stream.read(20)
+
+                # It's very clear in section 3.4.3 of the PDF spec
+                # that all cross-reference table lines are a fixed
+                # 20 bytes (as of PDF 1.7). However, some files have
+                # 21-byte entries (or more) due to the use of \r\n
+                # (CRLF) EOL's. Detect that case, and adjust the line
+                # until it does not begin with a \r (CR) or \n (LF).
+                while line[0] in b_("\x0D\x0A"):
+                    stream.seek(-20 + 1, 1)
+                    line = stream.read(20)
+
+                # On the other hand, some malformed PDF files
+                # use a single character EOL without a preceeding
+                # space.  Detect that case, and seek the stream
+                # back one character.  (0-9 means we've bled into
+                # the next xref entry, t means we've bled into the
+                # text "trailer"):
+                if line[-1] in b_("0123456789t"):
+                    stream.seek(-1, 1)
+
+                offset, generation = line[:16].split(b_(" "))
+                offset, generation = int(offset), int(generation)
+                if generation not in self.xref:
+                    self.xref[generation] = {}
+                if num in self.xref[generation]:
+                    # It really seems like we should allow the last
+                    # xref table in the file to override previous
+                    # ones. Since we read the file backwards, assume
+                    # any existing key is already set correctly.
+                    pass
+                else:
+                    self.xref[generation][num] = offset
+                cnt += 1
+                num += 1
+            readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            trailertag = stream.read(7)
+            if trailertag != b_("trailer"):
+                # more xrefs!
+                stream.seek(-7, 1)
+            else:
+                break
+
+    def _read_pdf15_xref_stream(self, stream):
+        # PDF 1.5+ Cross-Reference Stream
+        stream.seek(-1, 1)
+        idnum, generation = self.readObjectHeader(stream)
+        xrefstream = readObject(stream, self)
+        assert xrefstream["/Type"] == "/XRef"
+        self.cacheIndirectObject(generation, idnum, xrefstream)
+        streamData = BytesIO(b_(xrefstream.getData()))
+        # Index pairs specify the subsections in the dictionary. If
+        # none create one subsection that spans everything.
+        idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
+        entrySizes = xrefstream.get("/W")
+        assert len(entrySizes) >= 3
+        if self.strict and len(entrySizes) > 3:
+            raise PdfReadError("Too many entry sizes: %s" % entrySizes)
+
+        def getEntry(i):
+            # Reads the correct number of bytes for each entry. See the
+            # discussion of the W parameter in PDF spec table 17.
+            if entrySizes[i] > 0:
+                d = streamData.read(entrySizes[i])
+                return convertToInt(d, entrySizes[i])
+
+            # PDF Spec Table 17: A value of zero for an element in the
+            # W array indicates...the default value shall be used
+            if i == 0:
+                return 1  # First value defaults to 1
+            else:
+                return 0
+
+        def used_before(num, generation):
+            # We move backwards through the xrefs, don't replace any.
+            return num in self.xref.get(generation, []) or num in self.xref_objStm
+
+        # Iterate through each subsection
+        self._read_xref_subsections(idx_pairs, getEntry, used_before)
+        return xrefstream
+
+    @staticmethod
+    def _get_xref_issues(stream, startxref):
+        """Returns an int which indicates an issue. 0 means there is no issue."""
+        stream.seek(startxref - 1, 0)  # -1 to check character before
+        line = stream.read(1)
+        if line not in b_("\r\n \t"):
+            return 1
+        line = stream.read(4)
+        if line != b_("xref"):
+            # not an xref so check if it is an XREF object
+            line = b_("")
+            while line in b_("0123456789 \t"):
+                line = stream.read(1)
+                if line == b_(""):
+                    return 2
+            line += stream.read(2)  # 1 char already read, +2 to check "obj"
+            if line.lower() != b_("obj"):
+                return 3
+            while stream.read(1) in b_(" \t\r\n"):
+                pass
+            line = stream.read(256)  # check that it is xref obj
+            if b_("/xref") not in line.lower():
+                return 4
+        return 0
+
+    def _rebuild_xref_table(self, stream):
+        self.xref = {}
+        stream.seek(0, 0)
+        f_ = stream.read(-1)
+        import re
+
+        for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_):
+            idnum = int(m.group(1))
+            generation = int(m.group(2))
+            if generation not in self.xref:
+                self.xref[generation] = {}
+            self.xref[generation][idnum] = m.start(1)
+        trailerPos = f_.rfind(b"trailer") - len(f_) + 7
+        stream.seek(trailerPos, 2)
+        # code below duplicated
+        readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        newTrailer = readObject(stream, self)
+        for key, value in list(newTrailer.items()):
+            if key not in self.trailer:
+                self.trailer[key] = value
+
     def _read_xref_subsections(self, idx_pairs, getEntry, used_before):
         last_end = 0
         for start, size in self._pairs(idx_pairs):

diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -9,7 +9,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError, PdfReadWarning
+from PyPDF2.errors import PdfReadError
 from PyPDF2.filters import _xobj_to_image
 
 if version_info < (3, 0):
@@ -229,14 +229,12 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
     )
     pdf_stream = io.BytesIO(pdf_data)
     if should_fail:
-        with pytest.raises(Exception) as exc:
+        with pytest.raises(PdfReadError) as exc:
             PdfFileReader(pdf_stream, strict=strict)
-        if startx_correction != -1:
-            assert exc.type == PdfReadWarning
-        else:
+        assert exc.type == PdfReadError
+        if startx_correction == -1:
             assert (
-                exc.type == PdfReadError
-                and exc.value.args[0]
+                exc.value.args[0]
                 == "/Prev=0 in the trailer (try opening with strict=False)"
             )
     else:
@@ -245,10 +243,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
 
 def test_issue297():
     path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
-    with pytest.raises(PdfReadWarning) as exc:
+    with pytest.raises(PdfReadError) as exc:
         reader = PdfFileReader(path, strict=True)
         reader.getPage(0)
-    assert "startxref" in exc.value.args[0]
+    assert "Broken xref table" in exc.value.args[0]
     reader = PdfFileReader(path, strict=False)
     reader.getPage(0)