Skip to content

Commit

Permalink
ROB: ignore_eof everywhere for read_until_regex
Browse files Browse the repository at this point in the history
This was initially motivated by `NumberObject.read_from_stream`, which
was calling `read_until_regex` with the default value of
`ignore_eof=False` and thus raising exceptions like:

```
PyPDF2.errors.PdfStreamError: Stream has ended unexpectedly
```

431ba70
demonstrates a similar fix for `NameObject.read_from_stream`.

From discussion in #1505, it was
realized that the change to `NumberObject.read_from_stream` meant that
ALL callers of `read_until_regex` now pass `ignore_eof=True`. It's cleaner
to remove the parameter entirely and make ignoring EOF the only behaviour.
  • Loading branch information
rraval committed Dec 29, 2022
1 parent cfed01f commit cd70bae
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 19 deletions.
10 changes: 3 additions & 7 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,23 +141,19 @@ def skip_over_comment(stream: StreamType) -> None:
tok = stream.read(1)


def read_until_regex(
stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False
) -> bytes:
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
"""
Read until the regular expression pattern matched (ignore the match).
Treats EOF on the underlying stream as the end of the token to be matched.
:raises PdfStreamError: on premature end-of-file
:param bool ignore_eof: If true, ignore end-of-file and return immediately
:param regex: re.Pattern
"""
name = b""
while True:
tok = stream.read(16)
if not tok:
if ignore_eof:
return name
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
return name
m = regex.search(tok)
if m is not None:
name += tok[: m.start()]
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
if name != NameObject.surfix:
raise PdfReadError("name read error")
name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
name += read_until_regex(stream, NameObject.delimiter_pattern)
try:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,7 @@ def __parse_content_stream(self, stream: StreamType) -> None:
break
stream.seek(-1, 1)
if peek.isalpha() or peek in (b"'", b'"'):
operator = read_until_regex(stream, NameObject.delimiter_pattern, True)
operator = read_until_regex(stream, NameObject.delimiter_pattern)
if operator == b"BI":
# begin inline image - a completely different parsing
# mechanism is required, of course... thanks buddy...
Expand Down
11 changes: 1 addition & 10 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder):
assert stream.read() == remainder


def test_read_until_regex_premature_ending_raise():
import re

stream = io.BytesIO(b"")
with pytest.raises(PdfStreamError) as exc:
read_until_regex(stream, re.compile(b"."))
assert exc.value.args[0] == "Stream has ended unexpectedly"


def test_read_until_regex_premature_ending_name():
import re

stream = io.BytesIO(b"")
assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b""
assert read_until_regex(stream, re.compile(b".")) == b""


@pytest.mark.parametrize(
Expand Down

0 comments on commit cd70bae

Please sign in to comment.