Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #297 : fix corruption in startxref or xref table #788

Merged
merged 6 commits into from
Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1843,6 +1843,35 @@ def read(self, stream):
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")

# check the startxref pointer and, if necessary, correct it (correction only when not strict)
rebuildXrefTable = False
try:
stream.seek(startxref - 1,0) #-1 to check character before
line=stream.read(1)
if line not in b_("\r\n \t"):
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
raise PdfReadWarning("incorrect startxref pointer(1)",line)
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
line = stream.read(4)
if line != b_("xref"):
#not an xref so check if it is an XREF object
line = b_("")
while line in b_("0123456789 \t"):
line = stream.read(1)
if line == b_(""):
raise PdfReadWarning("incorrect startxref pointer(2)")
line += stream.read(2) #1 char already read, +2 to check "obj"
if line.lower() != b_("obj"):
raise PdfReadWarning("incorrect startxref pointer(3)")
while stream.read(1) in b_(" \t\r\n"):
pass;
line=stream.read(256) # check that it is xref obj
if b_("/xref") not in line.lower():
raise PdfReadWarning("incorrect startxref pointer(4)")
except PdfReadWarning as e:
warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning)
if( not self.strict):
rebuildXrefTable = True
else:
raise
# read all cross reference tables and their trailers
self.xref = {}
self.xref_objStm = {}
Expand Down Expand Up @@ -1928,6 +1957,30 @@ def read(self, stream):
startxref = newTrailer["/Prev"]
else:
break
elif rebuildXrefTable:
self.xref={}
stream.seek(0,0)
f_ = stream.read(-1)
import re
for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_):
idnum = int(m.group(1))
generation = int(m.group(2))
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailerPos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailerPos,2)
#code below duplicated
readNonWhitespace(stream)
stream.seek(-1, 1)
newTrailer = readObject(stream, self)
for key, value in list(newTrailer.items()):
if key not in self.trailer:
self.trailer[key] = value
#if "/Prev" in newTrailer:
# startxref = newTrailer["/Prev"]
#else:
break
elif x.isdigit():
# PDF 1.5+ Cross-Reference Stream
stream.seek(-1, 1)
Expand Down
53 changes: 30 additions & 23 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.filters import _xobj_to_image

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -164,15 +164,19 @@ def test_get_images(src, nb_images):


@pytest.mark.parametrize(
"strict,with_prev_0,should_fail",
"strict,with_prev_0,startx_correction,should_fail",
[
(True, True, True),
(True, False, False),
(False, True, False),
(False, False, False),
(True, False, -1, False), # all nominal => no fail
(True, True, -1, True), # Prev=0 => fail expected
(False, False, -1, False),
(False, True, -1, False), # Prev =0 => no strict so tolerant
(True, False, 0, True), # error on startxref, in strict => fail expected
(True, True, 0, True),
(False, False, 0, False), # error on startxref, but no strict => xref rebuilt,no fail
(False, True, 0, False),
],
)
def test_get_images_raw(strict, with_prev_0, should_fail):
def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail):
pdf_data = (
b"%%PDF-1.7\n"
b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
Expand Down Expand Up @@ -200,29 +204,32 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
b"/Prev 0 " if with_prev_0 else b"",
pdf_data.find(b"xref"),
# startx_correction should be -1 because the double % at the beginning induces an error in the startxref computation
pdf_data.find(b"xref") + startx_correction,
)
pdf_stream = io.BytesIO(pdf_data)
if should_fail:
with pytest.raises(PdfReadError) as exc:
with pytest.raises(Exception) as exc:
PdfFileReader(pdf_stream, strict=strict)
assert (
exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
if startx_correction != -1:
assert exc.type == PdfReadWarning
else:
assert (
exc.type == PdfReadError
and exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
else:
PdfFileReader(pdf_stream, strict=strict)


@pytest.mark.xfail(
reason=(
"It's still broken - and unclear what the issue is. "
"Help would be appreciated!"
)
)
def test_issue297():
path = os.path.join(RESOURCE_ROOT, "issue-297.pdf")
reader = PdfFileReader(path, "rb")
with pytest.raises(PdfReadWarning) as exc:
reader = PdfFileReader(path, strict=True)
reader.getPage(0)
assert "startxref" in exc.value.args[0]
reader = PdfFileReader(path, strict=False)
reader.getPage(0)


Expand Down Expand Up @@ -353,7 +360,7 @@ def test_read_prev_0_trailer():
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
b"/Prev 0 " if with_prev_0 else b"",
pdf_data.find(b"xref"),
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down Expand Up @@ -388,7 +395,7 @@ def test_read_missing_startxref():
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
# pdf_data.find(b"xref"),
# pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down Expand Up @@ -424,7 +431,7 @@ def test_read_unknown_zero_pages():
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
pdf_data.find(b"xref"),
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
with pytest.raises(PdfReadError) as exc:
Expand Down