ROB: ignore_eof everywhere for read_until_regex

This was initially motivated by `NumberObject.read_from_stream`, which was calling `read_until_regex` with the default value of `ignore_eof=False` and thus raising exceptions like:

```
PyPDF2.errors.PdfStreamError: Stream has ended unexpectedly
```

Commit 431ba70 demonstrates a similar fix for `NameObject.read_from_stream`.

From discussion in #1505, it was realized that, with the change to `NumberObject.read_from_stream`, ALL callers of `read_until_regex` now pass `ignore_eof=True`. It's cleaner to remove the parameter entirely and change the default behaviour.
py-pdf · Dec 29, 2022 · cd70bae · cd70bae
1 parent cfed01f
commit cd70bae
Show file tree

Hide file tree

Showing 4 changed files with 6 additions and 19 deletions.
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -141,23 +141,19 @@ def skip_over_comment(stream: StreamType) -> None:
             tok = stream.read(1)
 
 
-def read_until_regex(
-    stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False
-) -> bytes:
+def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
     """
     Read until the regular expression pattern matched (ignore the match).
+    Treats EOF on the underlying stream as the end of the token to be matched.
 
     :raises PdfStreamError: on premature end-of-file
-    :param bool ignore_eof: If true, ignore end-of-line and return immediately
     :param regex: re.Pattern
     """
     name = b""
     while True:
         tok = stream.read(16)
         if not tok:
-            if ignore_eof:
-                return name
-            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
+            return name
         m = regex.search(tok)
         if m is not None:
             name += tok[: m.start()]

diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
@@ -605,7 +605,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
         name = stream.read(1)
         if name != NameObject.surfix:
             raise PdfReadError("name read error")
-        name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
+        name += read_until_regex(stream, NameObject.delimiter_pattern)
         try:
             # Name objects should represent irregular characters
             # with a '#' followed by the symbol's hex number

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -921,7 +921,7 @@ def __parse_content_stream(self, stream: StreamType) -> None:
                 break
             stream.seek(-1, 1)
             if peek.isalpha() or peek in (b"'", b'"'):
-                operator = read_until_regex(stream, NameObject.delimiter_pattern, True)
+                operator = read_until_regex(stream, NameObject.delimiter_pattern)
                 if operator == b"BI":
                     # begin inline image - a completely different parsing
                     # mechanism is required, of course... thanks buddy...

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder):
     assert stream.read() == remainder
 
 
-def test_read_until_regex_premature_ending_raise():
-    import re
-
-    stream = io.BytesIO(b"")
-    with pytest.raises(PdfStreamError) as exc:
-        read_until_regex(stream, re.compile(b"."))
-    assert exc.value.args[0] == "Stream has ended unexpectedly"
-
-
 def test_read_until_regex_premature_ending_name():
     import re
 
     stream = io.BytesIO(b"")
-    assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b""
+    assert read_until_regex(stream, re.compile(b".")) == b""
 
 
 @pytest.mark.parametrize(