Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix pdf reader getting stuck on large files without startxref marker #295

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,11 +1758,11 @@ def read(self, stream):
while line[:5] != b_("%%EOF"):
if stream.tell() < last1K:
raise utils.PdfReadError("EOF marker not found")
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, maxLineLength=1024)
if debug: print(" line:",line)

# find startxref entry - the location of the xref table
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, maxLineLength=1024)
try:
startxref = int(line)
except ValueError:
Expand All @@ -1772,7 +1772,7 @@ def read(self, stream):
startxref = int(line[9:].strip())
warnings.warn("startxref on same line as offset")
else:
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, maxLineLength=1024)
if line[:9] != b_("startxref"):
raise utils.PdfReadError("startxref not found")

Expand Down Expand Up @@ -1991,7 +1991,10 @@ def _pairs(self, array):
if (i+1) >= len(array):
break

def readNextEndLine(self, stream):
def readNextEndLine(self, stream, maxLineLength=None):
'''
If maxLineLength is set, a PdfReadError will be raised if the line candidate is longer than this value.
'''
debug = False
if debug: print(">>readNextEndLine")
line = b_("")
Expand Down Expand Up @@ -2023,6 +2026,8 @@ def readNextEndLine(self, stream):
if debug: print(" x is neither")
line = x + line
if debug: print((" RNEL line:", line))
if maxLineLength is not None and len(line) > maxLineLength:
raise utils.PdfReadError("EOL marker not found")
if debug: print("leaving RNEL")
return line

Expand Down
12 changes: 11 additions & 1 deletion Tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
import sys
import unittest
import StringIO

from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2 import PdfFileReader, PdfFileWriter, utils


# Configure path environment
Expand Down Expand Up @@ -37,6 +38,15 @@ def test_PdfReaderFileLoad(self):
% (pdftext, ipdf_p1_text.encode('utf-8', errors='ignore')))


def test_PdfReaderDoesNotGetStuckOnLargeFilesWithoutStartxref(self):
'''Tests the absence of a "DOS"-kind of bug, where a large file without a startxref
will cause the library to hang'''
broken_stream = StringIO.StringIO(chr(0) * 10 * 1000 * 1000)

with self.assertRaises(utils.PdfReadError):
PdfFileReader(broken_stream)


class AddJsTestCase(unittest.TestCase):

def setUp(self):
Expand Down