Skip to content

Commit

Permalink
PI: Optimize read_next_end_line (#646)
Browse files Browse the repository at this point in the history
read_next_end_line is inefficient when handling long lines. If the stream is a buffered binary stream, each one-byte "backwards" read may trigger a full buffer read, and (more significantly) iteratively building the line we want to return is quadratic in its total length.
  • Loading branch information
ztravis authored Jun 9, 2022
1 parent 332fa5e commit 8cd0cfe
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 49 deletions.
54 changes: 8 additions & 46 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import struct
import warnings
from hashlib import md5
import os
from io import BytesIO
from typing import (
Any,
Expand All @@ -55,6 +56,7 @@
ord_,
read_non_whitespace,
read_until_whitespace,
read_previous_line,
skip_over_comment,
skip_over_whitespace,
)
Expand Down Expand Up @@ -1202,25 +1204,25 @@ def cacheIndirectObject(

def read(self, stream: StreamType) -> None:
# start at the end:
stream.seek(-1, 2)
stream.seek(0, os.SEEK_END)
if not stream.tell():
raise PdfReadError("Cannot read an empty file")
if self.strict:
stream.seek(0, 0)
stream.seek(0, os.SEEK_SET)
header_byte = stream.read(5)
if header_byte != b"%PDF-":
raise PdfReadError(
"PDF starts with '{}', but '%PDF-' expected".format(
header_byte.decode("utf8")
)
)
stream.seek(-1, 2)
stream.seek(0, os.SEEK_END)
last_mb = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream
line = b_("")
while line[:5] != b_("%%EOF"):
if stream.tell() < last_mb:
raise PdfReadError("EOF marker not found")
line = self.read_next_end_line(stream)
line = read_previous_line(stream)

startxref = self._find_startxref_pos(stream)

Expand Down Expand Up @@ -1327,7 +1329,7 @@ def read(self, stream: StreamType) -> None:

def _find_startxref_pos(self, stream: StreamType) -> int:
"""Find startxref entry - the location of the xref table"""
line = self.read_next_end_line(stream)
line = read_previous_line(stream)
try:
startxref = int(line)
except ValueError:
Expand All @@ -1337,7 +1339,7 @@ def _find_startxref_pos(self, stream: StreamType) -> int:
startxref = int(line[9:].strip())
warnings.warn("startxref on same line as offset", PdfReadWarning)
else:
line = self.read_next_end_line(stream)
line = read_previous_line(stream)
if line[:9] != b_("startxref"):
raise PdfReadError("startxref not found")
return startxref
Expand Down Expand Up @@ -1552,46 +1554,6 @@ def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
if (i + 1) >= len(array):
break

def read_next_end_line(self, stream: StreamType, limit_offset: int = 0) -> bytes:
line_parts = []
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0 or stream.tell() == limit_offset:
raise PdfReadError("Could not read malformed PDF file")
x = stream.read(1)
if stream.tell() < 2:
raise PdfReadError("EOL marker not found")
stream.seek(-2, 1)
if x == b_("\n") or x == b_("\r"): # \n = LF; \r = CR
crlf = False
while x == b_("\n") or x == b_("\r"):
x = stream.read(1)
if x == b_("\n") or x == b_("\r"): # account for CR+LF
stream.seek(-1, 1)
crlf = True
if stream.tell() < 2:
raise PdfReadError("EOL marker not found")
stream.seek(-2, 1)
stream.seek(
2 if crlf else 1, 1
) # if using CR+LF, go back 2 bytes, else 1
break
else:
line_parts.append(x)
line_parts.reverse()
return b"".join(line_parts)

def readNextEndLine(
self, stream: StreamType, limit_offset: int = 0
) -> bytes: # pragma: no cover
"""
.. deprecated:: 1.28.0
Use :meth:`read_next_end_line` instead.
"""
deprecate_with_replacement("readNextEndLine", "read_next_end_line")
return self.read_next_end_line(stream, limit_offset)

def decrypt(self, password: Union[str, bytes]) -> int:
"""
When using an encrypted / secured PDF file with the PDF Standard
Expand Down
72 changes: 71 additions & 1 deletion PyPDF2/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@

import warnings
from codecs import getencoder
from io import BufferedReader, BufferedWriter, BytesIO, FileIO
from io import BufferedReader, BufferedWriter, BytesIO, FileIO, DEFAULT_BUFFER_SIZE
import os
from typing import Any, Dict, Optional, Tuple, Union, overload

try:
Expand Down Expand Up @@ -131,6 +132,75 @@ def read_until_regex(stream: StreamType, regex: Any, ignore_eof: bool = False) -
return name


CRLF = b'\r\n'


def read_block_backwards(stream: StreamType, to_read: int) -> bytes:
"""Given a stream at position X, read a block of size
to_read ending at position X.
The stream's position should be unchanged.
"""
if stream.tell() < to_read:
raise PdfStreamError('Could not read malformed PDF file')
# Seek to the start of the block we want to read.
stream.seek(-to_read, os.SEEK_CUR)
read = stream.read(to_read)
# Seek to the start of the block we read after reading it.
stream.seek(-to_read, os.SEEK_CUR)
if len(read) != to_read:
raise PdfStreamError('EOF: read %s, expected %s?' % (len(read), to_read))
return read


def read_previous_line(stream: StreamType) -> bytes:
"""Given a byte stream with current position X, return the previous
line - all characters between the first CR/LF byte found before X
(or, the start of the file, if no such byte is found) and position X
After this call, the stream will be positioned one byte after the
first non-CRLF character found beyond the first CR/LF byte before X,
or, if no such byte is found, at the beginning of the stream.
"""
line_content = []
found_crlf = False
if stream.tell() == 0:
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
while True:
to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
if to_read == 0:
break
# Read the block. After this, our stream will be one
# beyond the initial position.
block = read_block_backwards(stream, to_read)
idx = len(block) - 1
if not found_crlf:
# We haven't found our first CR/LF yet.
# Read off characters until we hit one.
while idx >= 0 and block[idx] not in CRLF:
idx -= 1
if idx >= 0:
found_crlf = True
if found_crlf:
# We found our first CR/LF already (on this block or
# a previous one).
# Our combined line is the remainder of the block
# plus any previously read blocks.
line_content.append(block[idx + 1:])
# Continue to read off any more CRLF characters.
while idx >= 0 and block[idx] in CRLF:
idx -= 1
else:
# Didn't find CR/LF yet - add this block to our
# previously read blocks and continue.
line_content.append(block)
if idx >= 0:
# We found the next non-CRLF character.
# Set the stream position correctly, then break
stream.seek(idx + 1, os.SEEK_CUR)
break
# Join all the blocks in the line (which are in reverse order)
return b''.join(line_content[::-1])


def matrix_multiply(
a: TransformationMatrixType, b: TransformationMatrixType
) -> TransformationMatrixType:
Expand Down
4 changes: 2 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.errors import PdfReadError, PdfReadWarning, STREAM_TRUNCATED_PREMATURELY
from PyPDF2.filters import _xobj_to_image
from tests import get_pdf_from_url

Expand Down Expand Up @@ -400,7 +400,7 @@ def test_read_malformed_header():
def test_read_malformed_body():
with pytest.raises(PdfReadError) as exc:
PdfReader(io.BytesIO(b"%PDF-"), strict=True)
assert exc.value.args[0] == "Could not read malformed PDF file"
assert exc.value.args[0] == STREAM_TRUNCATED_PREMATURELY


def test_read_prev_0_trailer():
Expand Down
70 changes: 70 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
read_until_whitespace,
skip_over_comment,
skip_over_whitespace,
read_block_backwards,
read_previous_line
)
from PyPDF2.errors import PdfStreamError

Expand Down Expand Up @@ -121,3 +123,71 @@ def test_deprecate_no_replacement():
)
def test_paeth_predictor(left, up, upleft, expected):
assert PyPDF2._utils.paeth_predictor(left, up, upleft) == expected


@pytest.mark.parametrize(
("dat", "pos", "to_read"),
[
(b'', 0, 1),
(b'a', 0, 1),
(b'abc', 0, 10),
],
)
def test_read_block_backwards_errs(dat, pos, to_read):
with pytest.raises(PdfStreamError) as _:
s = io.BytesIO(dat)
s.seek(pos)
read_block_backwards(s, to_read)


@pytest.mark.parametrize(
("dat", "pos", "to_read", "expected", "expected_pos"),
[
(b'abc', 1, 0, b'', 1),
(b'abc', 1, 1, b'a', 0),
(b'abc', 2, 1, b'b', 1),
(b'abc', 2, 2, b'ab', 0),
(b'abc', 3, 1, b'c', 2),
(b'abc', 3, 2, b'bc', 1),
(b'abc', 3, 3, b'abc', 0),
],
)
def test_read_block_backwards(dat, pos, to_read, expected, expected_pos):
s = io.BytesIO(dat)
s.seek(pos)
assert read_block_backwards(s, to_read) == expected
assert s.tell() == expected_pos


def test_read_block_backwards_at_start():
s = io.BytesIO(b'abc')
with pytest.raises(PdfStreamError) as _:
read_previous_line(s)


@pytest.mark.parametrize(
("dat", "pos", "expected", "expected_pos"),
[
(b'abc', 1, b'a', 0),
(b'abc', 2, b'ab', 0),
(b'abc', 3, b'abc', 0),
(b'abc\n', 3, b'abc', 0),
(b'abc\n', 4, b'', 3),
(b'abc\n\r', 4, b'', 3),
(b'abc\nd', 5, b'd', 3),
# Skip over multiple CR/LF bytes
(b'abc\n\r\ndef', 9, b'def', 3),
# Include a block full of newlines...
(b'abc' + b'\n' * (2 * io.DEFAULT_BUFFER_SIZE) + b'd', 2 * io.DEFAULT_BUFFER_SIZE + 4, b'd', 3),
# Include a block full of non-newline characters
(b'abc\n' + b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 2 * io.DEFAULT_BUFFER_SIZE + 4, b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 3),
# Both
(b'abcxyz' + b'\n' * (2 * io.DEFAULT_BUFFER_SIZE) + b'd' * (2 * io.DEFAULT_BUFFER_SIZE),\
4 * io.DEFAULT_BUFFER_SIZE + 6, b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 6),
],
)
def test_read_previous_line(dat, pos, expected, expected_pos):
s = io.BytesIO(dat)
s.seek(pos)
assert read_previous_line(s) == expected
assert s.tell() == expected_pos

0 comments on commit 8cd0cfe

Please sign in to comment.