Skip to content

Commit

Permalink
ROB: Fix corruption in startxref or xref table (#788)
Browse files Browse the repository at this point in the history
Use PdfReadWarning instead of UserWarning to be consistent

Closes #297
  • Loading branch information
pubpub-zz authored Apr 27, 2022
1 parent 35086b6 commit 904b0df
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 23 deletions.
53 changes: 53 additions & 0 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1837,6 +1837,35 @@ def read(self, stream):
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")

# check the startxref pointer and, if it is wrong, correct it (only when not strict)
rebuildXrefTable = False
try:
stream.seek(startxref - 1,0) #-1 to check character before
line=stream.read(1)
if line not in b_("\r\n \t"):
raise PdfReadWarning("incorrect startxref pointer(1)",line)
line = stream.read(4)
if line != b_("xref"):
#not an xref so check if it is an XREF object
line = b_("")
while line in b_("0123456789 \t"):
line = stream.read(1)
if line == b_(""):
raise PdfReadWarning("incorrect startxref pointer(2)")
line += stream.read(2) #1 char already read, +2 to check "obj"
if line.lower() != b_("obj"):
raise PdfReadWarning("incorrect startxref pointer(3)")
while stream.read(1) in b_(" \t\r\n"):
pass;
line=stream.read(256) # check that it is xref obj
if b_("/xref") not in line.lower():
raise PdfReadWarning("incorrect startxref pointer(4)")
except PdfReadWarning as e:
warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
if( not self.strict):
rebuildXrefTable = True
else:
raise
# read all cross reference tables and their trailers
self.xref = {}
self.xref_objStm = {}
Expand Down Expand Up @@ -1922,6 +1951,30 @@ def read(self, stream):
startxref = newTrailer["/Prev"]
else:
break
elif rebuildXrefTable:
self.xref={}
stream.seek(0,0)
f_ = stream.read(-1)
import re
for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
idnum = int(m.group(1))
generation = int(m.group(2))
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailerPos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailerPos,2)
#code below duplicated
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
for key, value in list(newTrailer.items()):
if key not in self.trailer:
self.trailer[key] = value
#if "/Prev" in newTrailer:
# startxref = newTrailer["/Prev"]
#else:
break
elif x.isdigit():
# PDF 1.5+ Cross-Reference Stream
stream.seek(-1, 1)
Expand Down
53 changes: 30 additions & 23 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.filters import _xobj_to_image

if version_info < (3, 0):
Expand Down Expand Up @@ -184,15 +184,19 @@ def test_get_images(src, nb_images):


@pytest.mark.parametrize(
"strict,with_prev_0,should_fail",
"strict,with_prev_0,startx_correction,should_fail",
[
(True, True, True),
(True, False, False),
(False, True, False),
(False, False, False),
(True, False, -1, False), # all nominal => no fail
(True, True, -1, True), # Prev=0 => fail expected
(False, False, -1, False),
(False, True, -1, False), # Prev=0 => not strict, so tolerated
(True, False, 0, True), # error on startxref, in strict => fail expected
(True, True, 0, True),
(False, False, 0, False), # error on startxref, but not strict => xref rebuilt, no fail
(False, True, 0, False),
],
)
def test_get_images_raw(strict, with_prev_0, should_fail):
def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
pdf_data = (
b"%%PDF-1.7\n"
b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
Expand Down Expand Up @@ -220,29 +224,32 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
b"/Prev 0 " if with_prev_0 else b"",
pdf_data.find(b"xref"),
# startx_correction should be -1: the doubled % at the beginning of pdf_data induces an off-by-one error in the startxref computation
pdf_data.find(b"xref") + startx_correction,
)
pdf_stream = io.BytesIO(pdf_data)
if should_fail:
with pytest.raises(PdfReadError) as exc:
with pytest.raises(Exception) as exc:
PdfFileReader(pdf_stream, strict=strict)
assert (
exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
if startx_correction != -1:
assert exc.type == PdfReadWarning
else:
assert (
exc.type == PdfReadError
and exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
else:
PdfFileReader(pdf_stream, strict=strict)


@pytest.mark.xfail(
reason=(
"It's still broken - and unclear what the issue is. "
"Help would be appreciated!"
)
)
def test_issue297():
path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
reader = PdfFileReader(path, "rb")
with pytest.raises(PdfReadWarning) as exc:
reader = PdfFileReader(path, strict=True)
reader.getPage(0)
assert "startxref" in exc.value.args[0]
reader = PdfFileReader(path, strict=False)
reader.getPage(0)


Expand Down Expand Up @@ -384,7 +391,7 @@ def test_read_prev_0_trailer():
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
b"/Prev 0 " if with_prev_0 else b"",
pdf_data.find(b"xref"),
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down Expand Up @@ -419,7 +426,7 @@ def test_read_missing_startxref():
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
# pdf_data.find(b"xref"),
# pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down Expand Up @@ -455,7 +462,7 @@ def test_read_unknown_zero_pages():
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
pdf_data.find(b"xref"),
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down

0 comments on commit 904b0df

Please sign in to comment.