PI: Optimize read_next_end_line (#646)

read_next_end_line is inefficient when handling long lines. If the stream is a buffered binary stream, each one-byte "backwards" read may trigger a full buffer read, and (more significantly) iteratively building the line we want to return is quadratic in its total length.
py-pdf · Jun 9, 2022 · 8cd0cfe · 8cd0cfe
1 parent 332fa5e
commit 8cd0cfe
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 49 deletions.
diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py
@@ -31,6 +31,7 @@
 import struct
 import warnings
 from hashlib import md5
+import os
 from io import BytesIO
 from typing import (
     Any,
@@ -55,6 +56,7 @@
     ord_,
     read_non_whitespace,
     read_until_whitespace,
+    read_previous_line,
     skip_over_comment,
     skip_over_whitespace,
 )
@@ -1202,25 +1204,25 @@ def cacheIndirectObject(
 
     def read(self, stream: StreamType) -> None:
         # start at the end:
-        stream.seek(-1, 2)
+        stream.seek(0, os.SEEK_END)
         if not stream.tell():
             raise PdfReadError("Cannot read an empty file")
         if self.strict:
-            stream.seek(0, 0)
+            stream.seek(0, os.SEEK_SET)
             header_byte = stream.read(5)
             if header_byte != b"%PDF-":
                 raise PdfReadError(
                     "PDF starts with '{}', but '%PDF-' expected".format(
                         header_byte.decode("utf8")
                     )
                 )
-            stream.seek(-1, 2)
+            stream.seek(0, os.SEEK_END)
         last_mb = stream.tell() - 1024 * 1024 + 1  # offset of last MB of stream
         line = b_("")
         while line[:5] != b_("%%EOF"):
             if stream.tell() < last_mb:
                 raise PdfReadError("EOF marker not found")
-            line = self.read_next_end_line(stream)
+            line = read_previous_line(stream)
 
         startxref = self._find_startxref_pos(stream)
 
@@ -1327,7 +1329,7 @@ def read(self, stream: StreamType) -> None:
 
     def _find_startxref_pos(self, stream: StreamType) -> int:
         """Find startxref entry - the location of the xref table"""
-        line = self.read_next_end_line(stream)
+        line = read_previous_line(stream)
         try:
             startxref = int(line)
         except ValueError:
@@ -1337,7 +1339,7 @@ def _find_startxref_pos(self, stream: StreamType) -> int:
             startxref = int(line[9:].strip())
             warnings.warn("startxref on same line as offset", PdfReadWarning)
         else:
-            line = self.read_next_end_line(stream)
+            line = read_previous_line(stream)
             if line[:9] != b_("startxref"):
                 raise PdfReadError("startxref not found")
         return startxref
@@ -1552,46 +1554,6 @@ def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
             if (i + 1) >= len(array):
                 break
 
-    def read_next_end_line(self, stream: StreamType, limit_offset: int = 0) -> bytes:
-        line_parts = []
-        while True:
-            # Prevent infinite loops in malformed PDFs
-            if stream.tell() == 0 or stream.tell() == limit_offset:
-                raise PdfReadError("Could not read malformed PDF file")
-            x = stream.read(1)
-            if stream.tell() < 2:
-                raise PdfReadError("EOL marker not found")
-            stream.seek(-2, 1)
-            if x == b_("\n") or x == b_("\r"):  # \n = LF; \r = CR
-                crlf = False
-                while x == b_("\n") or x == b_("\r"):
-                    x = stream.read(1)
-                    if x == b_("\n") or x == b_("\r"):  # account for CR+LF
-                        stream.seek(-1, 1)
-                        crlf = True
-                    if stream.tell() < 2:
-                        raise PdfReadError("EOL marker not found")
-                    stream.seek(-2, 1)
-                stream.seek(
-                    2 if crlf else 1, 1
-                )  # if using CR+LF, go back 2 bytes, else 1
-                break
-            else:
-                line_parts.append(x)
-        line_parts.reverse()
-        return b"".join(line_parts)
-
-    def readNextEndLine(
-        self, stream: StreamType, limit_offset: int = 0
-    ) -> bytes:  # pragma: no cover
-        """
-        .. deprecated:: 1.28.0
-
-            Use :meth:`read_next_end_line` instead.
-        """
-        deprecate_with_replacement("readNextEndLine", "read_next_end_line")
-        return self.read_next_end_line(stream, limit_offset)
-
     def decrypt(self, password: Union[str, bytes]) -> int:
         """
         When using an encrypted / secured PDF file with the PDF Standard

diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py
@@ -33,7 +33,8 @@
 
 import warnings
 from codecs import getencoder
-from io import BufferedReader, BufferedWriter, BytesIO, FileIO
+from io import BufferedReader, BufferedWriter, BytesIO, FileIO, DEFAULT_BUFFER_SIZE
+import os
 from typing import Any, Dict, Optional, Tuple, Union, overload
 
 try:
@@ -131,6 +132,75 @@ def read_until_regex(stream: StreamType, regex: Any, ignore_eof: bool = False) -
     return name
 
 
+CRLF = b'\r\n'
+
+
+def read_block_backwards(stream: StreamType, to_read: int) -> bytes:
+    """Given a stream at position X, read a block of size
+    to_read ending at position X.
+    The stream's position should be unchanged.
+    """
+    if stream.tell() < to_read:
+        raise PdfStreamError('Could not read malformed PDF file')
+    # Seek to the start of the block we want to read.
+    stream.seek(-to_read, os.SEEK_CUR)
+    read = stream.read(to_read)
+    # Seek to the start of the block we read after reading it.
+    stream.seek(-to_read, os.SEEK_CUR)
+    if len(read) != to_read:
+        raise PdfStreamError('EOF: read %s, expected %s?' % (len(read), to_read))
+    return read
+
+
+def read_previous_line(stream: StreamType) -> bytes:
+    """Given a byte stream with current position X, return the previous
+    line - all characters between the first CR/LF byte found before X
+    (or, the start of the file, if no such byte is found) and position X
+    After this call, the stream will be positioned one byte after the
+    first non-CRLF character found beyond the first CR/LF byte before X,
+    or, if no such byte is found, at the beginning of the stream.
+    """
+    line_content = []
+    found_crlf = False
+    if stream.tell() == 0:
+        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
+    while True:
+        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
+        if to_read == 0:
+            break
+        # Read the block. After this, our stream will be one
+        # beyond the initial position.
+        block = read_block_backwards(stream, to_read)
+        idx = len(block) - 1
+        if not found_crlf:
+            # We haven't found our first CR/LF yet.
+            # Read off characters until we hit one.
+            while idx >= 0 and block[idx] not in CRLF:
+                idx -= 1
+            if idx >= 0:
+                found_crlf = True
+        if found_crlf:
+            # We found our first CR/LF already (on this block or
+            # a previous one).
+            # Our combined line is the remainder of the block
+            # plus any previously read blocks.
+            line_content.append(block[idx + 1:])
+            # Continue to read off any more CRLF characters.
+            while idx >= 0 and block[idx] in CRLF:
+                idx -= 1
+        else:
+            # Didn't find CR/LF yet - add this block to our
+            # previously read blocks and continue.
+            line_content.append(block)
+        if idx >= 0:
+            # We found the next non-CRLF character.
+            # Set the stream position correctly, then break
+            stream.seek(idx + 1, os.SEEK_CUR)
+            break
+    # Join all the blocks in the line (which are in reverse order)
+    return b''.join(line_content[::-1])
+
+
 def matrix_multiply(
     a: TransformationMatrixType, b: TransformationMatrixType
 ) -> TransformationMatrixType:

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -10,7 +10,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError, PdfReadWarning
+from PyPDF2.errors import PdfReadError, PdfReadWarning, STREAM_TRUNCATED_PREMATURELY
 from PyPDF2.filters import _xobj_to_image
 from tests import get_pdf_from_url
 
@@ -400,7 +400,7 @@ def test_read_malformed_header():
 def test_read_malformed_body():
     with pytest.raises(PdfReadError) as exc:
         PdfReader(io.BytesIO(b"%PDF-"), strict=True)
-    assert exc.value.args[0] == "Could not read malformed PDF file"
+    assert exc.value.args[0] == STREAM_TRUNCATED_PREMATURELY
 
 
 def test_read_prev_0_trailer():

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -11,6 +11,8 @@
     read_until_whitespace,
     skip_over_comment,
     skip_over_whitespace,
+    read_block_backwards,
+    read_previous_line
 )
 from PyPDF2.errors import PdfStreamError
 
@@ -121,3 +123,71 @@ def test_deprecate_no_replacement():
 )
 def test_paeth_predictor(left, up, upleft, expected):
     assert PyPDF2._utils.paeth_predictor(left, up, upleft) == expected
+
+
+@pytest.mark.parametrize(
+    ("dat", "pos", "to_read"),
+    [
+        (b'', 0, 1),
+        (b'a', 0, 1),
+        (b'abc', 0, 10),
+    ],
+)
+def test_read_block_backwards_errs(dat, pos, to_read):
+    with pytest.raises(PdfStreamError) as _:
+        s = io.BytesIO(dat)
+        s.seek(pos)
+        read_block_backwards(s, to_read)
+
+
+@pytest.mark.parametrize(
+    ("dat", "pos", "to_read", "expected", "expected_pos"),
+    [
+        (b'abc', 1, 0, b'', 1),
+        (b'abc', 1, 1, b'a', 0),
+        (b'abc', 2, 1, b'b', 1),
+        (b'abc', 2, 2, b'ab', 0),
+        (b'abc', 3, 1, b'c', 2),
+        (b'abc', 3, 2, b'bc', 1),
+        (b'abc', 3, 3, b'abc', 0),
+    ],
+)
+def test_read_block_backwards(dat, pos, to_read, expected, expected_pos):
+    s = io.BytesIO(dat)
+    s.seek(pos)
+    assert read_block_backwards(s, to_read) == expected
+    assert s.tell() == expected_pos
+
+
+def test_read_block_backwards_at_start():
+    s = io.BytesIO(b'abc')
+    with pytest.raises(PdfStreamError) as _:
+        read_previous_line(s)
+
+
+@pytest.mark.parametrize(
+    ("dat", "pos", "expected", "expected_pos"),
+    [
+        (b'abc', 1, b'a', 0),
+        (b'abc', 2, b'ab', 0),
+        (b'abc', 3, b'abc', 0),
+        (b'abc\n', 3, b'abc', 0),
+        (b'abc\n', 4, b'', 3),
+        (b'abc\n\r', 4, b'', 3),
+        (b'abc\nd', 5, b'd', 3),
+        # Skip over multiple CR/LF bytes
+        (b'abc\n\r\ndef', 9, b'def', 3),
+        # Include a block full of newlines...
+        (b'abc' + b'\n' * (2 * io.DEFAULT_BUFFER_SIZE) + b'd', 2 * io.DEFAULT_BUFFER_SIZE + 4, b'd', 3),
+        # Include a block full of non-newline characters
+        (b'abc\n' + b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 2 * io.DEFAULT_BUFFER_SIZE + 4, b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 3),
+        # Both
+        (b'abcxyz' + b'\n' * (2 * io.DEFAULT_BUFFER_SIZE) + b'd' * (2 * io.DEFAULT_BUFFER_SIZE),\
+            4 * io.DEFAULT_BUFFER_SIZE + 6, b'd' * (2 * io.DEFAULT_BUFFER_SIZE), 6),
+    ],
+)
+def test_read_previous_line(dat, pos, expected, expected_pos):
+    s = io.BytesIO(dat)
+    s.seek(pos)
+    assert read_previous_line(s) == expected
+    assert s.tell() == expected_pos