py-pdf · MartinThoma · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -33,11 +33,6 @@ repos:
     hooks:
     -   id: black
         args: [--target-version, py36]
-# -   repo: https://github.com/asottile/pyupgrade
-#     rev: v2.31.1
-#     hooks:
-#     -   id: pyupgrade
-#         args: [--py36-plus]
 -   repo: https://github.com/asottile/blacken-docs
     rev: v1.12.1
     hooks:

diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py
@@ -1,3 +1,12 @@
+"""
+PyPDF2 is a free and open-source pure-Python PDF library capable of splitting,
+merging, cropping, and transforming the pages of PDF files. It can also add
+custom data, viewing options, and passwords to PDF files. PyPDF2 can retrieve
+text and metadata from PDFs as well.
+
+You can read the full docs at https://pypdf2.readthedocs.io/.
+"""
+
 from ._merger import PdfFileMerger, PdfMerger
 from ._page import PageObject, Transformation
 from ._reader import DocumentInformation, PdfFileReader, PdfReader

diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -248,8 +248,8 @@ def parse_to_unicode(
                 )  # join is here as some cases where the code was split
                 int_entry.append(int(lst[0], 16))
                 lst = lst[2:]
-    for a in map_dict:
-        if map_dict[a] == " ":
+    for a, value in map_dict.items():
+        if value == " ":
             space_code = a
     return map_dict, space_code, int_entry
 

diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py
@@ -45,16 +45,16 @@
     NumberObject,
     TextStringObject,
     TreeObject,
-    createStringObject,
+    _create_bookmark,
 )
 from .pagerange import PageRange, PageRangeSpec
 from .types import (
     BookmarkTypes,
+    FitType,
     LayoutType,
     OutlinesType,
     PagemodeType,
     ZoomArgsType,
-    ZoomArgType,
 )
 
 ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore"
@@ -564,9 +564,7 @@ def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None:
             if pageno is not None:
                 nd[NameObject("/Page")] = NumberObject(pageno)
             else:
-                raise ValueError(
-                    "Unresolved named destination '{}'".format(nd["/Title"])
-                )
+                raise ValueError(f"Unresolved named destination '{nd['/Title']}'")
 
     def _associate_bookmarks_to_pages(
         self, pages: List[_MergedPage], bookmarks: Optional[Iterable[Bookmark]] = None
@@ -592,7 +590,7 @@ def _associate_bookmarks_to_pages(
             if pageno is not None:
                 b[NameObject("/Page")] = NumberObject(pageno)
             else:
-                raise ValueError("Unresolved bookmark '{}'".format(b["/Title"]))
+                raise ValueError(f"Unresolved bookmark '{b['/Title']}'")
 
     def find_bookmark(
         self,
@@ -623,8 +621,8 @@ def addBookmark(
         color: Optional[Tuple[float, float, float]] = None,
         bold: bool = False,
         italic: bool = False,
-        fit: str = "/Fit",
-        *args: ZoomArgType,
+        fit: FitType = "/Fit",
+        *args: ZoomArgsType,
     ) -> IndirectObject:  # pragma: no cover
         """
         .. deprecated:: 1.28.0
@@ -643,8 +641,8 @@ def add_bookmark(
         color: Optional[Tuple[float, float, float]] = None,
         bold: bool = False,
         italic: bool = False,
-        fit: str = "/Fit",
-        *args: ZoomArgType,
+        fit: FitType = "/Fit",
+        *args: ZoomArgsType,
     ) -> IndirectObject:
         """
         Add a bookmark to this PDF file.
@@ -658,7 +656,7 @@ def add_bookmark(
         :param bool bold: Bookmark is bold
         :param bool italic: Bookmark is italic
         :param str fit: The fit of the destination page. See
-            :meth:`addLink()<addLin>` for details.
+            :meth:`addLink()<addLink>` for details.
         """
         if self.output is None:
             raise RuntimeError(ERR_CLOSED_WRITER)
@@ -689,32 +687,13 @@ def add_bookmark(
         if parent is None:
             parent = outline_ref
 
-        bookmark = TreeObject()
+        bookmark = _create_bookmark(action_ref, title, color, italic, bold)
 
-        bookmark.update(
-            {
-                NameObject("/A"): action_ref,
-                NameObject("/Title"): createStringObject(title),
-            }
-        )
-
-        if color is not None:
-            bookmark.update(
-                {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])}
-            )
-
-        format_flag = 0
-        if italic:
-            format_flag += 1
-        if bold:
-            format_flag += 2
-        if format_flag:
-            bookmark.update({NameObject("/F"): NumberObject(format_flag)})
-
-        bookmark_ref = self.output._add_object(bookmark)
-        parent = cast(Bookmark, parent.get_object())
         assert parent is not None, "hint for mypy"
-        parent.add_child(bookmark_ref, self.output)
+        bookmark_ref = self.output._add_object(bookmark)
+        parent_obj = cast(Bookmark, parent.get_object())
+        assert parent_obj is not None, "hint for mypy"
+        parent_obj.add_child(bookmark_ref, self.output)
 
         return bookmark_ref
 

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -374,7 +374,7 @@ def _content_stream_rename(
                     if isinstance(op, NameObject):
                         operands[i] = rename.get(op, op)
             else:
-                raise KeyError("type of operands is %s" % type(operands))
+                raise KeyError(f"type of operands is {type(operands)}")
         return stream
 
     @staticmethod

diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py
@@ -803,7 +803,7 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]:
                 outline = self._namedDests[dest]
                 outline[NameObject("/Title")] = title  # type: ignore
             else:
-                raise PdfReadError("Unexpected destination %r" % dest)
+                raise PdfReadError(f"Unexpected destination {dest!r}")
         return outline
 
     @property
@@ -993,13 +993,14 @@ def _get_object_from_stream(
                 # Stream object cannot be read. Normally, a critical error, but
                 # Adobe Reader doesn't complain, so continue (in strict mode?)
                 warnings.warn(
-                    "Invalid stream (index %d) within object %d %d: %s"
-                    % (i, indirect_reference.idnum, indirect_reference.generation, exc),
+                    f"Invalid stream (index {i}) within object "
+                    f"{indirect_reference.idnum} {indirect_reference.generation}: "
+                    f"{exc}",
                     PdfReadWarning,
                 )
 
                 if self.strict:
-                    raise PdfReadError("Can't read object stream: %s" % exc)
+                    raise PdfReadError(f"Can't read object stream: {exc}")
                 # Replace with null. Hopefully it's nothing important.
                 obj = NullObject()
             return obj
@@ -1030,26 +1031,18 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
                 # Xref table probably had bad indexes due to not being zero-indexed
                 if self.strict:
                     raise PdfReadError(
-                        "Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed."
-                        % (
-                            indirect_reference.idnum,
-                            indirect_reference.generation,
-                            idnum,
-                            generation,
-                        )
+                        f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
+                        f"does not match actual ({idnum} {generation}); "
+                        "xref table not zero-indexed."
                     )
                 else:
                     pass  # xref table is corrected in non-strict mode
             elif idnum != indirect_reference.idnum and self.strict:
                 # some other problem
                 raise PdfReadError(
-                    "Expected object ID (%d %d) does not match actual (%d %d)."
-                    % (
-                        indirect_reference.idnum,
-                        indirect_reference.generation,
-                        idnum,
-                        generation,
-                    )
+                    f"Expected object ID ({indirect_reference.idnum} "
+                    f"{indirect_reference.generation}) does not match actual "
+                    f"({idnum} {generation})."
                 )
             if self.strict:
                 assert generation == indirect_reference.generation
@@ -1070,8 +1063,8 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
                 retval = self._decrypt_object(retval, key)  # type: ignore
         else:
             warnings.warn(
-                "Object %d %d not defined."
-                % (indirect_reference.idnum, indirect_reference.generation),
+                f"Object {indirect_reference.idnum} {indirect_reference.generation} "
+                "not defined.",
                 PdfReadWarning,
             )
             if self.strict:
@@ -1144,8 +1137,7 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
         stream.seek(-1, 1)
         if extra and self.strict:
             warnings.warn(
-                "Superfluous whitespace found in object header %s %s"
-                % (idnum, generation),  # type: ignore
+                f"Superfluous whitespace found in object header {idnum} {generation}",  # type: ignore
                 PdfReadWarning,
             )
         return int(idnum), int(generation)
@@ -1212,9 +1204,8 @@ def read(self, stream: StreamType) -> None:
             header_byte = stream.read(5)
             if header_byte != b"%PDF-":
                 raise PdfReadError(
-                    "PDF starts with '{}', but '%PDF-' expected".format(
-                        header_byte.decode("utf8")
-                    )
+                    f"PDF starts with '{header_byte.decode('utf8')}', "
+                    "but '%PDF-' expected"
                 )
             stream.seek(0, os.SEEK_END)
         last_mb = stream.tell() - 1024 * 1024 + 1  # offset of last MB of stream
@@ -1237,8 +1228,8 @@ def read(self, stream: StreamType) -> None:
                 )
 
         # read all cross reference tables and their trailers
-        self.xref: Dict[Any, Any] = {}
-        self.xref_objStm: Dict[Any, Any] = {}
+        self.xref: Dict[int, Dict[Any, Any]] = {}
+        self.xref_objStm: Dict[int, Tuple[Any, Any]] = {}
         self.trailer = DictionaryObject()
         while True:
             # load the xref table
@@ -1311,13 +1302,13 @@ def read(self, stream: StreamType) -> None:
         # if not zero-indexed, verify that the table is correct; change it if necessary
         if self.xref_index and not self.strict:
             loc = stream.tell()
-            for gen in self.xref:
+            for gen, xref_entry in self.xref.items():
                 if gen == 65535:
                     continue
-                for id in self.xref[gen]:
-                    stream.seek(self.xref[gen][id], 0)
+                for id in xref_entry:
+                    stream.seek(xref_entry[id], 0)
                     try:
-                        pid, pgen = self.read_object_header(stream)
+                        pid, _pgen = self.read_object_header(stream)
                     except ValueError:
                         break
                     if pid == id - self.xref_index:
@@ -1431,7 +1422,7 @@ def _read_pdf15_xref_stream(
         entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W"))
         assert len(entry_sizes) >= 3
         if self.strict and len(entry_sizes) > 3:
-            raise PdfReadError("Too many entry sizes: %s" % entry_sizes)
+            raise PdfReadError(f"Too many entry sizes: {entry_sizes}")
 
         def get_entry(i: int) -> Union[int, Tuple[int, ...]]:
             # Reads the correct number of bytes for each entry. See the
@@ -1449,7 +1440,7 @@ def get_entry(i: int) -> Union[int, Tuple[int, ...]]:
 
         def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool:
             # We move backwards through the xrefs, don't replace any.
-            return num in self.xref.get(generation, []) or num in self.xref_objStm
+            return num in self.xref.get(generation, []) or num in self.xref_objStm  # type: ignore
 
         # Iterate through each subsection
         self._read_xref_subsections(idx_pairs, get_entry, used_before)
@@ -1528,9 +1519,9 @@ def _read_xref_subsections(
                     byte_offset = get_entry(1)
                     generation = get_entry(2)
                     if generation not in self.xref:
-                        self.xref[generation] = {}
+                        self.xref[generation] = {}  # type: ignore
                     if not used_before(num, generation):
-                        self.xref[generation][num] = byte_offset
+                        self.xref[generation][num] = byte_offset  # type: ignore
                 elif xref_type == 2:
                     # compressed objects
                     objstr_num = get_entry(1)
@@ -1539,7 +1530,7 @@ def _read_xref_subsections(
                     if not used_before(num, generation):
                         self.xref_objStm[num] = (objstr_num, obstr_idx)
                 elif self.strict:
-                    raise PdfReadError("Unknown xref type: %s" % xref_type)
+                    raise PdfReadError(f"Unknown xref type: {xref_type}")
 
     def _zero_xref(self, generation: int) -> None:
         self.xref[generation] = {
@@ -1568,11 +1559,11 @@ def read_next_end_line(
             if stream.tell() < 2:
                 raise PdfReadError("EOL marker not found")
             stream.seek(-2, 1)
-            if x == b"\n" or x == b"\r":  # \n = LF; \r = CR
+            if x in (b"\n", b"\r"):  # \n = LF; \r = CR
                 crlf = False
-                while x == b"\n" or x == b"\r":
+                while x in (b"\n", b"\r"):
                     x = stream.read(1)
-                    if x == b"\n" or x == b"\r":  # account for CR+LF
+                    if x in (b"\n", b"\r"):  # account for CR+LF
                         stream.seek(-1, 1)
                         crlf = True
                     if stream.tell() < 2:
@@ -1657,8 +1648,7 @@ def _decrypt(self, password: Union[str, bytes]) -> int:
         encrypt_v = cast(int, encrypt["/V"])
         if encrypt_v not in (1, 2):
             raise NotImplementedError(
-                "only algorithm code 1 and 2 are supported. This PDF uses code %s"
-                % encrypt_v
+                f"only algorithm code 1 and 2 are supported. This PDF uses code {encrypt_v}"
             )
         user_password, key = self._authenticate_user_password(password)
         if user_password:
@@ -1678,8 +1668,8 @@ def _decrypt(self, password: Union[str, bytes]) -> int:
                 val = real_O
                 for i in range(19, -1, -1):
                     new_key = b""
-                    for l in range(len(key)):
-                        new_key += b_(chr(ord_(key[l]) ^ i))
+                    for key_char in key:
+                        new_key += b_(chr(ord_(key_char) ^ i))
                     val = RC4_encrypt(new_key, val)
                 userpass = val
             owner_password, key = self._authenticate_user_password(userpass)

diff --git a/PyPDF2/_security.py b/PyPDF2/_security.py
@@ -119,8 +119,8 @@ def _alg33(owner_pwd: str, user_pwd: str, rev: int, keylen: int) -> bytes:
     if rev >= 3:
         for i in range(1, 20):
             new_key = ""
-            for l in range(len(key)):
-                new_key += chr(ord_(key[l]) ^ i)
+            for key_char in key:
+                new_key += chr(ord_(key_char) ^ i)
             val = RC4_encrypt(new_key, val)
     # 8. Store the output from the final invocation of the RC4 as the value of
     # the /O entry in the encryption dictionary.
@@ -233,10 +233,10 @@ def RC4_encrypt(key: Union[str, bytes], plaintext: bytes) -> bytes:
         S[i], S[j] = S[j], S[i]
     i, j = 0, 0
     retval = []
-    for x in range(len(plaintext)):
+    for plaintext_char in plaintext:
         i = (i + 1) % 256
         j = (j + S[i]) % 256
         S[i], S[j] = S[j], S[i]
         t = S[(S[i] + S[j]) % 256]
-        retval.append(b_(chr(ord_(plaintext[x]) ^ t)))
+        retval.append(b_(chr(ord_(plaintext_char) ^ t)))
     return b"".join(retval)