Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

STY: Minor code-style improvements for _reader.py #2847

Merged
merged 16 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 70 additions & 55 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,19 @@
self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
self.trailer = DictionaryObject()

self._page_id2num: Optional[
Dict[Any, Any]
] = None # map page indirect_reference number to Page Number
# map page indirect_reference number to page number
self._page_id2num: Optional[Dict[Any, Any]] = None

self._initialize_stream(stream)

self._override_encryption = False
self._encryption: Optional[Encryption] = None
if self.is_encrypted:
self._handle_encryption(password)
elif password is not None:
raise PdfReadError("Not an encrypted file")

def _initialize_stream(self, stream: Union[StrByteType, Path]) -> None:
if hasattr(stream, "mode") and "b" not in stream.mode:
logger_warning(
"PdfReader stream/file object is not in binary mode. "
Expand All @@ -142,31 +152,25 @@
self.read(stream)
self.stream = stream

def _handle_encryption(self, password: Optional[Union[str, bytes]]) -> None:
    """
    Build the encryption handler from the trailer and verify the password.

    Args:
        password: Password supplied by the caller, or None. When None, an
            empty byte string is tried instead.

    Raises:
        WrongPasswordError: If a password was supplied and verification
            reports ``PasswordType.NOT_DECRYPTED``.
    """
    self._override_encryption = True

    # Some documents may not have a /ID, use two empty
    # byte strings instead. Solves
    # https://github.com/py-pdf/pypdf/issues/608
    id_entry = self.trailer.get(TK.ID)
    if id_entry:
        id1_entry = id_entry[0].get_object().original_bytes
    else:
        id1_entry = b""
    encrypt_entry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
    self._encryption = Encryption.read(encrypt_entry, id1_entry)

    # Fall back to the empty password when the caller gave none.
    candidate = b"" if password is None else password
    outcome = self._encryption.verify(candidate)
    # A failed attempt with the implicit empty password is not an error;
    # only an explicitly supplied password that fails is reported.
    if outcome == PasswordType.NOT_DECRYPTED and password is not None:
        raise WrongPasswordError("Wrong password")
    self._override_encryption = False
self._encryption: Optional[Encryption] = None
if self.is_encrypted:
self._override_encryption = True
# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/py-pdf/pypdf/issues/608
id_entry = self.trailer.get(TK.ID)
id1_entry = id_entry[0].get_object().original_bytes if id_entry else b""
encrypt_entry = cast(
DictionaryObject, self.trailer[TK.ENCRYPT].get_object()
)
self._encryption = Encryption.read(encrypt_entry, id1_entry)

# try empty password if no password provided
pwd = password if password is not None else b""
if (
self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED
and password is not None
):
# raise if password provided
raise WrongPasswordError("Wrong password")
self._override_encryption = False
elif password is not None:
raise PdfReadError("Not encrypted file")

def __enter__(self) -> "PdfReader":
return self
Expand Down Expand Up @@ -286,29 +290,30 @@
self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
"""
Generate _page_id2num.
Retrieve the page number from an indirect reference.

Args:
indirect_reference:
indirect_reference: The indirect reference to locate.

Returns:
The page number or None
Page number or None.
"""
if self._page_id2num is None:
self._page_id2num = {
x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore
x.indirect_reference.idnum: i
for i, x in enumerate(self.pages)
if x.indirect_reference is not None
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
}

if is_null_or_none(indirect_reference):
return None
assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"
if isinstance(indirect_reference, int):
idnum = indirect_reference
else:
idnum = indirect_reference.idnum
assert self._page_id2num is not None, "hint for mypy"
ret = self._page_id2num.get(idnum, None)
return ret
idnum = (
indirect_reference
if isinstance(indirect_reference, int)
else indirect_reference.idnum
)
return self._page_id2num.get(idnum)

def _get_object_from_stream(
self, indirect_reference: IndirectObject
Expand Down Expand Up @@ -562,6 +567,12 @@
return obj

def read(self, stream: StreamType) -> None:
"""
Read and process the PDF stream, extracting necessary data.

Args:
stream (StreamType): The PDF file stream.
"""
self._basic_validation(stream)
self._find_eof_marker(stream)
startxref = self._find_startxref_pos(stream)
Expand Down Expand Up @@ -621,7 +632,7 @@
stream.seek(loc, 0) # return to where it was

def _basic_validation(self, stream: StreamType) -> None:
"""Ensure file is not empty. Read at most 5 bytes."""
"""Ensure the stream is valid and not empty."""
stream.seek(0, os.SEEK_SET)
try:
header_byte = stream.read(5)
Expand Down Expand Up @@ -803,6 +814,7 @@
def _read_xref_tables_and_trailers(
self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int
) -> None:
"""Read the cross-reference tables and trailers in the PDF stream."""
self.xref = {}
self.xref_free_entry = {}
self.xref_objStm = {}
Expand All @@ -827,28 +839,31 @@
except Exception as e:
if TK.ROOT in self.trailer:
logger_warning(
f"Previous trailer can not be read {e.args}",
__name__,
f"Previous trailer cannot be read: {e.args}", __name__
)
break
else:
raise PdfReadError(f"trailer can not be read {e.args}")
trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE
for key in trailer_keys:
if key in xrefstream and key not in self.trailer:
self.trailer[NameObject(key)] = xrefstream.raw_get(key)
if "/XRefStm" in xrefstream:
p = stream.tell()
stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0)
self._read_pdf15_xref_stream(stream)
stream.seek(p, 0)
raise PdfReadError(f"Trailer cannot be read: {e.args}")

Check warning on line 846 in pypdf/_reader.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_reader.py#L846

Added line #L846 was not covered by tests
self._process_xref_stream(xrefstream)
if "/Prev" in xrefstream:
startxref = cast(int, xrefstream["/Prev"])
else:
break
else:
startxref = self._read_xref_other_error(stream, startxref)

def _process_xref_stream(
    self, xrefstream: DictionaryObject, stream: Optional[StreamType] = None
) -> None:
    """
    Merge trailer entries from an xref stream dictionary and follow /XRefStm.

    Args:
        xrefstream: Dictionary whose ROOT, ENCRYPT, INFO, ID and SIZE
            entries are copied into ``self.trailer`` when the trailer does
            not already hold them.
        stream: Stream to seek in when ``/XRefStm`` is present. Falls back
            to ``self.stream`` when omitted.
    """
    for key in (TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE):
        # Entries already present in the trailer take precedence.
        if key in xrefstream and key not in self.trailer:
            self.trailer[NameObject(key)] = xrefstream.raw_get(key)

    if "/XRefStm" not in xrefstream:
        return

    # NOTE(review): _initialize_stream assigns self.stream only after
    # read() has returned, so this fallback may be unset while the file is
    # still being parsed. Call sites on that path should pass their own
    # stream explicitly - confirm and update _read_xref_tables_and_trailers.
    if stream is None:
        stream = self.stream

    saved_pos = stream.tell()
    # +1 offsets the one-byte step back that _read_pdf15_xref_stream
    # performs before reading the object header.
    stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0)
    self._read_pdf15_xref_stream(stream)
    stream.seek(saved_pos, 0)  # return to where it was

Check warning on line 865 in pypdf/_reader.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_reader.py#L862-L865

Added lines #L862 - L865 were not covered by tests

def _read_xref(self, stream: StreamType) -> Optional[int]:
self._read_standard_xref_table(stream)
if stream.read(1) == b"":
Expand Down Expand Up @@ -921,7 +936,7 @@
def _read_pdf15_xref_stream(
self, stream: StreamType
) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]:
# PDF 1.5+ Cross-Reference Stream
"""Read the cross-reference stream for PDF 1.5+."""
stream.seek(-1, 1)
idnum, generation = self.read_object_header(stream)
xrefstream = cast(ContentStream, read_object(stream, self))
Expand Down Expand Up @@ -1049,6 +1064,7 @@
get_entry: Callable[[int], Union[int, Tuple[int, ...]]],
used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool],
) -> None:
"""Read and process the subsections of the xref."""
for start, size in self._pairs(idx_pairs):
# The subsections must increase
for num in range(start, start + size):
Expand Down Expand Up @@ -1078,12 +1094,11 @@
raise PdfReadError(f"Unknown xref type: {xref_type}")

def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
"""Iterate over pairs in the array."""
i = 0
while True:
while i + 1 < len(array):
yield array[i], array[i + 1]
i += 2
if (i + 1) >= len(array):
break

def decrypt(self, password: Union[str, bytes]) -> PasswordType:
"""
Expand Down
2 changes: 1 addition & 1 deletion tests/test_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def test_attempt_decrypt_unencrypted_pdf():
path = RESOURCE_ROOT / "crazyones.pdf"
with pytest.raises(PdfReadError) as exc:
PdfReader(path, password="nonexistent")
assert exc.value.args[0] == "Not encrypted file"
assert exc.value.args[0] == "Not an encrypted file"


@pytest.mark.skipif(not HAS_AES, reason="No AES implementation")
Expand Down
2 changes: 1 addition & 1 deletion tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,7 +1293,7 @@ def test_reader(caplog):
url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf"
name = "shiv_resume.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert "Previous trailer can not be read" in caplog.text
assert "Previous trailer cannot be read" in caplog.text
caplog.clear()
# first call requires some reparations...
reader.pages[0].extract_text()
Expand Down
Loading