Skip to content

Commit

Permalink
MAINT: Refactoring after #788 (#830)
Browse files Browse the repository at this point in the history
This refactoring aims at making maintenance easier:

1. Too long functions make it hard to grasp the overall behavior. Hence the _get_xref_issues function was split out
2. `_get_xref_issues` is made a static method of the PdfFileReader to show that it belongs to the reader, but doesn't require any of its attributes
3. `_get_xref_issues` makes use of an integer return value instead of raising + catching exceptions. 
4. `_rebuild_xref_table` was moved to a method for the same reason.
  • Loading branch information
MartinThoma committed Apr 28, 2022
1 parent 1df2e4b commit ebe03d3
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 161 deletions.
321 changes: 169 additions & 152 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1837,35 +1837,15 @@ def read(self, stream):
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")

#check and eventually correct the startxref only in not strict
rebuildXrefTable = False
try:
stream.seek(startxref - 1,0) #-1 to check character before
line=stream.read(1)
if line not in b_("\r\n \t"):
raise PdfReadWarning("incorrect startxref pointer(1)",line)
line = stream.read(4)
if line != b_("xref"):
#not an xref so check if it is an XREF object
line = b_("")
while line in b_("0123456789 \t"):
line = stream.read(1)
if line == b_(""):
raise PdfReadWarning("incorrect startxref pointer(2)")
line += stream.read(2) #1 char already read, +2 to check "obj"
if line.lower() != b_("obj"):
raise PdfReadWarning("incorrect startxref pointer(3)")
while stream.read(1) in b_(" \t\r\n"):
pass;
line=stream.read(256) # check that it is xref obj
if b_("/xref") not in line.lower():
raise PdfReadWarning("incorrect startxref pointer(4)")
except PdfReadWarning as e:
warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
if( not self.strict):
rebuildXrefTable = True
else:
raise
# check and eventually correct the startxref only in not strict
xref_issue_nr = self._get_xref_issues(stream, startxref)
if self.strict and xref_issue_nr:
raise PdfReadError("Broken xref table")
else:
warnings.warn(
"incorrect startxref pointer({})".format(xref_issue_nr), PdfReadWarning
)

# read all cross reference tables and their trailers
self.xref = {}
self.xref_objStm = {}
Expand All @@ -1875,72 +1855,7 @@ def read(self, stream):
stream.seek(startxref, 0)
x = stream.read(1)
if x == b_("x"):
# standard cross-reference table
ref = stream.read(4)
if ref[:3] != b_("ref"):
raise PdfReadError("xref table read error")
readNonWhitespace(stream)
stream.seek(-1, 1)
firsttime = True; # check if the first time looking at the xref table
while True:
num = readObject(stream, self)
if firsttime and num != 0:
self.xrefIndex = num
if self.strict:
warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", PdfReadWarning)
# if table not zero indexed, could be due to error from when PDF was created
# which will lead to mismatched indices later on, only warned and corrected if self.strict=True
firsttime = False
readNonWhitespace(stream)
stream.seek(-1, 1)
size = readObject(stream, self)
readNonWhitespace(stream)
stream.seek(-1, 1)
cnt = 0
while cnt < size:
line = stream.read(20)

# It's very clear in section 3.4.3 of the PDF spec
# that all cross-reference table lines are a fixed
# 20 bytes (as of PDF 1.7). However, some files have
# 21-byte entries (or more) due to the use of \r\n
# (CRLF) EOL's. Detect that case, and adjust the line
# until it does not begin with a \r (CR) or \n (LF).
while line[0] in b_("\x0D\x0A"):
stream.seek(-20 + 1, 1)
line = stream.read(20)

# On the other hand, some malformed PDF files
# use a single character EOL without a preceeding
# space. Detect that case, and seek the stream
# back one character. (0-9 means we've bled into
# the next xref entry, t means we've bled into the
# text "trailer"):
if line[-1] in b_("0123456789t"):
stream.seek(-1, 1)

offset, generation = line[:16].split(b_(" "))
offset, generation = int(offset), int(generation)
if generation not in self.xref:
self.xref[generation] = {}
if num in self.xref[generation]:
# It really seems like we should allow the last
# xref table in the file to override previous
# ones. Since we read the file backwards, assume
# any existing key is already set correctly.
pass
else:
self.xref[generation][num] = offset
cnt += 1
num += 1
readNonWhitespace(stream)
stream.seek(-1, 1)
trailertag = stream.read(7)
if trailertag != b_("trailer"):
# more xrefs!
stream.seek(-7, 1)
else:
break
self._read_standard_xref_table(stream)
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
Expand All @@ -1951,65 +1866,11 @@ def read(self, stream):
startxref = newTrailer["/Prev"]
else:
break
elif rebuildXrefTable:
self.xref={}
stream.seek(0,0)
f_ = stream.read(-1)
import re
for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
idnum = int(m.group(1))
generation = int(m.group(2))
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailerPos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailerPos,2)
#code below duplicated
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
for key, value in list(newTrailer.items()):
if key not in self.trailer:
self.trailer[key] = value
#if "/Prev" in newTrailer:
# startxref = newTrailer["/Prev"]
#else:
elif xref_issue_nr:
self._rebuild_xref_table(stream)
break
elif x.isdigit():
# PDF 1.5+ Cross-Reference Stream
stream.seek(-1, 1)
idnum, generation = self.readObjectHeader(stream)
xrefstream = readObject(stream, self)
assert xrefstream["/Type"] == "/XRef"
self.cacheIndirectObject(generation, idnum, xrefstream)
streamData = BytesIO(b_(xrefstream.getData()))
# Index pairs specify the subsections in the dictionary. If
# none create one subsection that spans everything.
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
entrySizes = xrefstream.get("/W")
assert len(entrySizes) >= 3
if self.strict and len(entrySizes) > 3:
raise PdfReadError("Too many entry sizes: %s" %entrySizes)

def getEntry(i):
# Reads the correct number of bytes for each entry. See the
# discussion of the W parameter in PDF spec table 17.
if entrySizes[i] > 0:
d = streamData.read(entrySizes[i])
return convertToInt(d, entrySizes[i])

# PDF Spec Table 17: A value of zero for an element in the
# W array indicates...the default value shall be used
if i == 0: return 1 # First value defaults to 1
else: return 0

def used_before(num, generation):
# We move backwards through the xrefs, don't replace any.
return num in self.xref.get(generation, []) or \
num in self.xref_objStm

# Iterate through each subsection
self._read_xref_subsections(idx_pairs, getEntry, used_before)
xrefstream = self._read_pdf15_xref_stream(stream)

trailerKeys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID
for key in trailerKeys:
Expand Down Expand Up @@ -2071,6 +1932,162 @@ def used_before(num, generation):
# if not, then either it's just plain wrong, or the non-zero-index is actually correct
stream.seek(loc, 0) # return to where it was

def _read_standard_xref_table(self, stream):
# standard cross-reference table
ref = stream.read(4)
if ref[:3] != b_("ref"):
raise PdfReadError("xref table read error")
readNonWhitespace(stream)
stream.seek(-1, 1)
firsttime = True # check if the first time looking at the xref table
while True:
num = readObject(stream, self)
if firsttime and num != 0:
self.xrefIndex = num
if self.strict:
warnings.warn(
"Xref table not zero-indexed. ID numbers for objects will be corrected.",
PdfReadWarning,
)
# if table not zero indexed, could be due to error from when PDF was created
# which will lead to mismatched indices later on, only warned and corrected if self.strict=True
firsttime = False
readNonWhitespace(stream)
stream.seek(-1, 1)
size = readObject(stream, self)
readNonWhitespace(stream)
stream.seek(-1, 1)
cnt = 0
while cnt < size:
line = stream.read(20)

# It's very clear in section 3.4.3 of the PDF spec
# that all cross-reference table lines are a fixed
# 20 bytes (as of PDF 1.7). However, some files have
# 21-byte entries (or more) due to the use of \r\n
# (CRLF) EOL's. Detect that case, and adjust the line
# until it does not begin with a \r (CR) or \n (LF).
while line[0] in b_("\x0D\x0A"):
stream.seek(-20 + 1, 1)
line = stream.read(20)

# On the other hand, some malformed PDF files
# use a single character EOL without a preceeding
# space. Detect that case, and seek the stream
# back one character. (0-9 means we've bled into
# the next xref entry, t means we've bled into the
# text "trailer"):
if line[-1] in b_("0123456789t"):
stream.seek(-1, 1)

offset, generation = line[:16].split(b_(" "))
offset, generation = int(offset), int(generation)
if generation not in self.xref:
self.xref[generation] = {}
if num in self.xref[generation]:
# It really seems like we should allow the last
# xref table in the file to override previous
# ones. Since we read the file backwards, assume
# any existing key is already set correctly.
pass
else:
self.xref[generation][num] = offset
cnt += 1
num += 1
readNonWhitespace(stream)
stream.seek(-1, 1)
trailertag = stream.read(7)
if trailertag != b_("trailer"):
# more xrefs!
stream.seek(-7, 1)
else:
break

def _read_pdf15_xref_stream(self, stream):
# PDF 1.5+ Cross-Reference Stream
stream.seek(-1, 1)
idnum, generation = self.readObjectHeader(stream)
xrefstream = readObject(stream, self)
assert xrefstream["/Type"] == "/XRef"
self.cacheIndirectObject(generation, idnum, xrefstream)
streamData = BytesIO(b_(xrefstream.getData()))
# Index pairs specify the subsections in the dictionary. If
# none create one subsection that spans everything.
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
entrySizes = xrefstream.get("/W")
assert len(entrySizes) >= 3
if self.strict and len(entrySizes) > 3:
raise PdfReadError("Too many entry sizes: %s" % entrySizes)

def getEntry(i):
# Reads the correct number of bytes for each entry. See the
# discussion of the W parameter in PDF spec table 17.
if entrySizes[i] > 0:
d = streamData.read(entrySizes[i])
return convertToInt(d, entrySizes[i])

# PDF Spec Table 17: A value of zero for an element in the
# W array indicates...the default value shall be used
if i == 0:
return 1 # First value defaults to 1
else:
return 0

def used_before(num, generation):
# We move backwards through the xrefs, don't replace any.
return num in self.xref.get(generation, []) or num in self.xref_objStm

# Iterate through each subsection
self._read_xref_subsections(idx_pairs, getEntry, used_before)
return xrefstream

@staticmethod
def _get_xref_issues(stream, startxref):
"""Returns an int which indicates an issue. 0 means there is no issue."""
stream.seek(startxref - 1, 0) # -1 to check character before
line = stream.read(1)
if line not in b_("\r\n \t"):
return 1
line = stream.read(4)
if line != b_("xref"):
# not an xref so check if it is an XREF object
line = b_("")
while line in b_("0123456789 \t"):
line = stream.read(1)
if line == b_(""):
return 2
line += stream.read(2) # 1 char already read, +2 to check "obj"
if line.lower() != b_("obj"):
return 3
while stream.read(1) in b_(" \t\r\n"):
pass
line = stream.read(256) # check that it is xref obj
if b_("/xref") not in line.lower():
return 4
return 0

def _rebuild_xref_table(self, stream):
self.xref = {}
stream.seek(0, 0)
f_ = stream.read(-1)
import re

for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_):
idnum = int(m.group(1))
generation = int(m.group(2))
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailerPos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailerPos, 2)
# code below duplicated
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
for key, value in list(newTrailer.items()):
if key not in self.trailer:
self.trailer[key] = value

def _read_xref_subsections(self, idx_pairs, getEntry, used_before):
last_end = 0
for start, size in self._pairs(idx_pairs):
Expand Down
16 changes: 7 additions & 9 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.errors import PdfReadError
from PyPDF2.filters import _xobj_to_image

if version_info < (3, 0):
Expand Down Expand Up @@ -229,14 +229,12 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
)
pdf_stream = io.BytesIO(pdf_data)
if should_fail:
with pytest.raises(Exception) as exc:
with pytest.raises(PdfReadError) as exc:
PdfFileReader(pdf_stream, strict=strict)
if startx_correction != -1:
assert exc.type == PdfReadWarning
else:
assert exc.type == PdfReadError
if startx_correction == -1:
assert (
exc.type == PdfReadError
and exc.value.args[0]
exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
else:
Expand All @@ -245,10 +243,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):

def test_issue297():
path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
with pytest.raises(PdfReadWarning) as exc:
with pytest.raises(PdfReadError) as exc:
reader = PdfFileReader(path, strict=True)
reader.getPage(0)
assert "startxref" in exc.value.args[0]
assert "Broken xref table" in exc.value.args[0]
reader = PdfFileReader(path, strict=False)
reader.getPage(0)

Expand Down

0 comments on commit ebe03d3

Please sign in to comment.