From 699e1ad0fbdcf28396163f8ae47d0158c6f6eb27 Mon Sep 17 00:00:00 2001 From: dsk7 Date: Sat, 23 Apr 2022 17:39:18 +0200 Subject: [PATCH] BUG: fix pdf reader getting stuck when trying to read large files without xref marker --- PyPDF2/pdf.py | 8 ++++---- Tests/test_reader.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 035eeb223..a6e361edb 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -2082,7 +2082,7 @@ def _pairs(self, array): def readNextEndLine(self, stream, limit_offset=0): debug = False if debug: print(">>readNextEndLine") - line = b_("") + line_parts = [] while True: # Prevent infinite loops in malformed PDFs if stream.tell() == 0 or stream.tell() == limit_offset: @@ -2109,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0): break else: if debug: print(" x is neither") - line = x + line - if debug: print((" RNEL line:", line)) + line_parts.append(x) if debug: print("leaving RNEL") - return line + line_parts.reverse() + return b"".join(line_parts) def decrypt(self, password): """ diff --git a/Tests/test_reader.py b/Tests/test_reader.py index d38b32ab9..5230ee02e 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -1,5 +1,6 @@ import io import os +import time import pytest @@ -9,6 +10,14 @@ from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadError from PyPDF2.filters import _xobj_to_image +from sys import version_info + +if version_info < ( 3, 0 ): + from cStringIO import StringIO + StreamIO = StringIO +else: + from io import BytesIO + StreamIO = BytesIO TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -462,3 +471,16 @@ def test_get_destination_age_number(): for outline in outlines: if not isinstance(outline, list): reader.getDestinationPageNumber(outline) + + +def test_do_not_get_stuck_on_large_files_without_start_xref(): + """Tests for the absence of a DoS bug, where a 
large file without a startxref marker + would cause the library to hang for minutes to hours """ + start_time = time.time() + broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000) + with pytest.raises(PdfReadError): + PdfFileReader(broken_stream) + parse_duration = time.time() - start_time + # parsing is expected to take less than a second on a modern cpu, but include a large + # tolerance to account for busy or slow systems + assert parse_duration < 60