Skip to content

Commit

Permalink
BUG: fix pdf reader getting stuck when trying to read large files wit…
Browse files Browse the repository at this point in the history
…hout xref marker
  • Loading branch information
dsk7 committed Apr 23, 2022
1 parent 3d65938 commit 3be7ec7
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 7 deletions.
14 changes: 7 additions & 7 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,11 +1836,11 @@ def read(self, stream):
while line[:5] != b_("%%EOF"):
if stream.tell() < last1M:
raise PdfReadError("EOF marker not found")
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, last1M)
if debug: print(" line:",line)

# find startxref entry - the location of the xref table
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, last1M)
try:
startxref = int(line)
except ValueError:
Expand All @@ -1850,7 +1850,7 @@ def read(self, stream):
startxref = int(line[9:].strip())
warnings.warn("startxref on same line as offset")
else:
line = self.readNextEndLine(stream)
line = self.readNextEndLine(stream, last1M)
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")

Expand Down Expand Up @@ -2082,7 +2082,7 @@ def _pairs(self, array):
def readNextEndLine(self, stream, limit_offset=0):
debug = False
if debug: print(">>readNextEndLine")
line = b_("")
line_parts = []
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0 or stream.tell() == limit_offset:
Expand All @@ -2109,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
break
else:
if debug: print(" x is neither")
line = x + line
if debug: print((" RNEL line:", line))
line_parts.append(x)
if debug: print("leaving RNEL")
return line
line_parts.reverse()
return b"".join(line_parts)

def decrypt(self, password):
"""
Expand Down
22 changes: 22 additions & 0 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import io
import os
import time

import pytest

Expand All @@ -9,6 +10,14 @@
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError
from PyPDF2.filters import _xobj_to_image
from sys import version_info

if version_info < ( 3, 0 ):
from cStringIO import StringIO
StreamIO = StringIO
else:
from io import BytesIO
StreamIO = BytesIO

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
Expand Down Expand Up @@ -462,3 +471,16 @@ def test_get_destination_age_number():
for outline in outlines:
if not isinstance(outline, list):
reader.getDestinationPageNumber(outline)


def test_do_not_get_stuck_on_large_files_without_start_xref():
    """Guard against a DoS bug in the reader.

    A large file that contains no startxref marker used to make the library
    hang for minutes to hours; it must now be rejected promptly with a
    PdfReadError.
    """
    began_at = time.time()
    # 5 MB of NUL bytes: big enough to expose slow backwards scanning, and
    # guaranteed to contain neither an EOF marker nor a startxref entry.
    nul_filled_stream = StreamIO(b"\0" * (5 * 1000 * 1000))
    with pytest.raises(PdfReadError):
        PdfFileReader(nul_filled_stream)
    elapsed_seconds = time.time() - began_at
    # Parsing is expected to take less than a second on a modern CPU; the
    # generous bound only exists to tolerate busy or slow systems.
    assert elapsed_seconds < 60

0 comments on commit 3be7ec7

Please sign in to comment.