Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Fix infinite loop due to Invalid object #1331

Merged
merged 2 commits into from
Sep 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject
from .generic import DecodedStreamObject, DictionaryObject, NameObject


# Code freely inspired by @twiggy; see #711
Expand Down Expand Up @@ -124,6 +124,7 @@ def parse_encoding(
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
if isinstance(enc, str):
try:
enc = NameObject.unnumber(enc) # for #xx decoding
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
Expand Down
9 changes: 9 additions & 0 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
buf = bytes(self.stream.getbuffer()) # type: ignore
else:
p = self.stream.tell()
self.stream.seek(0, 0)
buf = self.stream.read(-1)
self.stream.seek(p, 0)
m = re.search(
Expand Down Expand Up @@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
buf = bytes(self.stream.getbuffer()) # type: ignore
else:
p = self.stream.tell()
self.stream.seek(0, 0)
buf = self.stream.read(-1)
self.stream.seek(p, 0)
m = re.search(
Expand Down Expand Up @@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]:
retval[tag] = es
return retval

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development convenience for fetching an object by its identifiers.

    Equivalent to ``generic.IndirectObject(num, gen, self).get_object()``.
    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()


class PdfFileReader(PdfReader): # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any) -> None:
Expand Down
13 changes: 11 additions & 2 deletions PyPDF2/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,14 @@ def writeToStream(
deprecate_with_replacement("writeToStream", "write_to_stream")
self.write_to_stream(stream, encryption_key)

@staticmethod
def unnumber(sin: str) -> str:
    """
    Decode ``#xx`` hexadecimal escapes in a PDF name.

    Every ``#`` that is immediately followed by exactly two hexadecimal
    digits is replaced by the character with that code. A ``#`` that is
    not followed by two hexadecimal digits (malformed or truncated escape)
    is left untouched instead of raising ``ValueError``.

    :param sin: the name as read from the file, possibly containing escapes.
    :return: the name with all well-formed escapes decoded.
    """
    hex_digits = "0123456789abcdefABCDEF"
    i = sin.find("#")
    while i >= 0:
        code = sin[i + 1 : i + 3]
        if len(code) == 2 and code[0] in hex_digits and code[1] in hex_digits:
            sin = sin[:i] + chr(int(code, 16)) + sin[i + 3 :]
        # Resume *after* position i: the character now at i is either a
        # decoded one (which may itself be "#", from "#23") or a "#" that
        # does not start a valid escape. Rescanning from the beginning
        # would decode it a second time.
        i = sin.find("#", i + 1)
    return sin

@staticmethod
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
Expand All @@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
ret = name.decode("utf-8")
except (UnicodeEncodeError, UnicodeDecodeError):
ret = name.decode("gbk")
return NameObject(ret)
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
ret = NameObject.unnumber(ret)
return NameObject(ret)
except (UnicodeEncodeError, UnicodeDecodeError) as e:
if not pdf.strict:
logger_warning("Illegal character in Name Object", __name__)
return NameObject(name)
Expand Down
39 changes: 25 additions & 14 deletions PyPDF2/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
from ._utils import read_hex_string_from_stream, read_string_from_stream

logger = logging.getLogger(__name__)
ObjectPrefix = b"/<[tf(n%"
NumberSigns = b"+-"
IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")

Expand Down Expand Up @@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader
stream.read(1)
break
stream.seek(-1, 1)
key = read_object(stream, pdf)
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
value = read_object(stream, pdf, forced_encoding)
try:
key = read_object(stream, pdf)
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
value = read_object(stream, pdf, forced_encoding)
except Exception as exc:
if pdf is not None and pdf.strict:
raise PdfReadError(exc.__repr__())
logger_warning(exc.__repr__(), __name__)
retval = DictionaryObject()
retval.update(data)
return retval # return partial data

if not data.get(key):
data[key] = value
else:
Expand Down Expand Up @@ -812,10 +820,9 @@ def read_object(
) -> Union[PdfObject, int, str, ContentStream]:
tok = stream.read(1)
stream.seek(-1, 1) # reset to start
idx = ObjectPrefix.find(tok)
if idx == 0:
if tok == b"/":
return NameObject.read_from_stream(stream, pdf)
elif idx == 1:
elif tok == b"<":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice that you got rid of ObjectPrefix / idx. It's way easier to read now 👍

# hexadecimal string OR dictionary
peek = stream.read(2)
stream.seek(-2, 1) # reset to start
Expand All @@ -824,15 +831,15 @@ def read_object(
return DictionaryObject.read_from_stream(stream, pdf, forced_encoding)
else:
return read_hex_string_from_stream(stream, forced_encoding)
elif idx == 2:
elif tok == b"[":
return ArrayObject.read_from_stream(stream, pdf, forced_encoding)
elif idx == 3 or idx == 4:
elif tok == b"t" or tok == b"f":
return BooleanObject.read_from_stream(stream)
elif idx == 5:
elif tok == b"(":
return read_string_from_stream(stream, forced_encoding)
elif idx == 6:
elif tok == b"n":
return NullObject.read_from_stream(stream)
elif idx == 7:
elif tok == b"%":
# comment
while tok not in (b"\r", b"\n"):
tok = stream.read(1)
Expand All @@ -843,14 +850,18 @@ def read_object(
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
return read_object(stream, pdf, forced_encoding)
else:
elif tok in b"0123456789+-.":
# number object OR indirect reference
peek = stream.read(20)
stream.seek(-len(peek), 1) # reset to start
if IndirectPattern.match(peek) is not None:
return IndirectObject.read_from_stream(stream, pdf)
else:
return NumberObject.read_from_stream(stream)
else:
raise PdfReadError(
f"Invalid Elementary Object starting with {tok} @{stream.tell()}" # type: ignore
)


class Field(TreeObject):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,17 @@ def test_NameObject():
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, None)
assert exc.value.args[0] == "name read error"
assert (
NameObject.read_from_stream(
BytesIO(b"/A;Name_With-Various***Characters?"), None
)
== "/A;Name_With-Various***Characters?"
)
assert (
NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
== "/paired()parentheses"
)
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"


def test_destination_fit_r():
Expand Down