Merge branch 'main' into encodedstream

py-pdf · Jun 11, 2023 · de8e22f · de8e22f
2 parents d4b1429 + 54e027a
commit de8e22f
Show file tree

Hide file tree

Showing 17 changed files with 130 additions and 81 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,41 +1,48 @@
-Version 3.9.0, 2023-05-21
--------------------------
+# CHANGELOG
+
+## Version 3.9.1, 2023-06-04
+
+### Deprecations (DEP)
+-  Deprecate PdfMerger (#1866)
+
+### Bug Fixes (BUG)
+-  Ignore UTF-8 decode errors (#1865)
+
+### Robustness (ROB)
+-  Handle missing /Type entry in Page tree (#1859)
+
+
+[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.9.0...3.9.1)
 
-New Features (ENH):
+## Version 3.9.0, 2023-05-21
+
+### New Features (ENH)
 -  Simplify metadata input (Document Information Dictionary) (#1851)
 -  Extend cmap compatibility to GBK_EUC_H/V (#1812)
 
-Bug Fixes (BUG):
+### Bug Fixes (BUG)
 -  Prevent infinite loop when no character follows after a comment (#1828)
 -  get_contents does not return ContentStream (#1847)
 -  Accept XYZ destination with zoom missing (default to zoom=0.0) (#1844)
 -  Cope with 1 Bit images (#1815)
 
-Robustness (ROB):
+### Robustness (ROB)
 -  Handle missing /Type entry in Page tree (#1845)
 
-Documentation (DOC):
+### Documentation (DOC)
 -  Expand file size explanations (#1835)
 -  Add comparison with pdfplumber (#1837)
 -  Clarify that PyPDF2 is dead (#1827)
 -  Add Hunter King as Contributor for #1806
 
-Maintenance (MAINT):
+### Maintenance (MAINT)
 -  Refactor internal Encryption class (#1821)
 -  Add R parameter to generate_values (#1820)
 -  Make encryption_key parameter of write_to_stream optional (#1819)
 -  Prepare for adding AES encryption support (#1818)
 
-Testing (TST):
--  Parametrize test_cmap_encodings (#1823)
-
-Code Style (STY):
--  Iterate directly over the list instead of using range (#1839)
--  Minor refactorings in _encryption.py (#1822)
+[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.9.0)
 
-[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.8.2)
-
-# CHANGELOG
 
 ## Version 3.8.1, 2023-04-23
 

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -34,6 +34,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra
 * [Pinheiro, Arthur](https://github.com/xilopaint)
 * [programmarchy](https://github.com/programmarchy)
 * [pubpub-zz](https://github.com/pubpub-zz): involved in community development
+* [RitchieP](https://github.com/RitchieP) | [LinkedIn](https://www.linkedin.com/in/ritchie-p-892b31115/) | [StackOverflow](https://stackoverflow.com/users/13328625/casual-r?tab=profile)
 * [Rogmann, Sascha](https://github.com/srogmann)
 * [robbiebusinessacc](https://github.com/robbiebusinessacc)
 * [sietzeberends](https://github.com/sietzeberends)

diff --git a/docs/meta/comparisons.md b/docs/meta/comparisons.md
@@ -56,7 +56,7 @@ than PyPDF2. See [history of pypdf](history.md).
 extracting the [font size](https://stackoverflow.com/a/69962459/562769)
 / font weight (bold-ness). It has no capabilities for writing PDF files.
 
-[`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file.
+[`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [drawing lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and saving it as an image file. Please note that the image conversion is done via ImageMagick (see [`pdfplumber`'s documentation](https://github.com/jsvine/pdfplumber#visual-debugging)).
 
 The `pdfplumber` community is active in answering questions and the library is maintained as of May 2023.
 

diff --git a/make_changelog.py b/make_changelog.py
@@ -25,13 +25,14 @@ def main(changelog_path: str) -> None:
     changelog = get_changelog(changelog_path)
     git_tag = get_most_recent_git_tag()
     changes = get_formatted_changes(git_tag)
-    print("-" * 80)
+    if changes == "":
+        print("No changes")
+        return
     print(changes)
 
     new_version = version_bump(git_tag)
     today = datetime.now(tz=timezone.utc)
-    header = f"Version {new_version}, {today:%Y-%m-%d}\n"
-    header = header + "-" * (len(header) - 1) + "\n"
+    header = f"## Version {new_version}, {today:%Y-%m-%d}\n"
     url = f"https://github.com/py-pdf/pypdf/compare/{git_tag}...{new_version}"
     trailer = f"\n[Full Changelog]({url})\n\n"
     new_entry = header + changes + trailer
@@ -42,10 +43,15 @@ def main(changelog_path: str) -> None:
         print("Changelog is already up-to-date!")
         return
 
-    new_changelog = new_entry + changelog
+    new_changelog = "# CHANGELOG\n\n" + new_entry + strip_header(changelog)
     write_changelog(new_changelog, changelog_path)
 
 
+def strip_header(md: str) -> str:
+    """
+    Remove the top-level '# CHANGELOG' header from a changelog text.
+
+    Args:
+        md: The markdown content of the changelog.
+
+    Returns:
+        The content without the leading '# CHANGELOG' header and without
+        surrounding whitespace. Content that does not start with that
+        header is only stripped of surrounding whitespace.
+    """
+    header = "# CHANGELOG"
+    text = md.strip()
+    # str.lstrip(chars) removes a *set of characters*, not a prefix, so it
+    # would also eat the "## " of a leading "## Version ..." line. Compare
+    # against the exact prefix instead.
+    if text.startswith(header):
+        text = text[len(header) :].lstrip()
+    return text
+
+
 def version_bump(git_tag: str) -> str:
     """
     Increase the patch version of the git tag by one.
@@ -117,6 +123,7 @@ def get_formatted_changes(git_tag: str) -> str:
         "ROB",
         "DOC",
         "DEV",
+        "CI",
         "MAINT",
         "TST",
         "STY",
@@ -129,6 +136,7 @@ def get_formatted_changes(git_tag: str) -> str:
         "ROB": "Robustness",
         "DOC": "Documentation",
         "DEV": "Developer Experience",
+        "CI": "Continuous Integration",
         "MAINT": "Maintenance",
         "TST": "Testing",
         "STY": "Code Style",
@@ -140,17 +148,15 @@ def get_formatted_changes(git_tag: str) -> str:
     for prefix in order:
         if prefix not in grouped:
             continue
-        output += f"\n{abbrev2long[prefix]} ({prefix}):\n"  # header
+        output += f"\n### {abbrev2long[prefix]} ({prefix})\n"  # header
         for commit in grouped[prefix]:
             output += f"- {commit['msg']}\n"
         del grouped[prefix]
 
     if grouped:
-        print("@" * 80)
-        output += "\nYou forgot something!:\n"
+        output += "\n### Other\n"
         for prefix in grouped:
             output += f"- {prefix}: {grouped[prefix]}\n"
-        print("@" * 80)
 
     return output
 
@@ -193,7 +199,7 @@ def get_git_commits_since_tag(git_tag: str) -> List[Change]:
             stderr=subprocess.STDOUT,
         )
     ).strip("'b\\n")
-    return [parse_commit_line(line) for line in commits.split("\\n")]
+    return [parse_commit_line(line) for line in commits.split("\\n") if line != ""]
 
 
 def parse_commit_line(line: str) -> Change:
@@ -210,7 +216,7 @@ def parse_commit_line(line: str) -> Change:
         ValueError: The commit line is not well-structured
     """
     if "\\t" not in line:
-        raise ValueError(f"Invalid commit line: {line}")
+        raise ValueError(f"Invalid commit line: '{line}'")
     commit_hash, rest = line.split("\\t", 1)
     if ":" in rest:
         prefix, message = rest.split(":", 1)

diff --git a/pypdf/_merger.py b/pypdf/_merger.py
@@ -46,6 +46,7 @@
 from ._reader import PdfReader
 from ._utils import (
     StrByteType,
+    deprecate_with_replacement,
     deprecation_bookmark,
     deprecation_with_replacement,
     str_,
@@ -86,26 +87,16 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
 
 class PdfMerger:
     """
-    Initialize a ``PdfMerger`` object.
+    Use :class:`PdfWriter` instead.
 
-    ``PdfMerger`` merges multiple PDFs into a single PDF.
-    It can concatenate, slice, insert, or any combination of the above.
-
-    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
-    and :meth:`write()<write>` for usage information.
-
-    Args:
-        strict: Determines whether user should be warned of all
-            problems and also causes some correctable problems to be fatal.
-            Defaults to ``False``.
-        fileobj: Output file. Can be a filename or any kind of
-            file-like object.
+    .. deprecated:: 5.0.0
     """
 
     @deprecation_bookmark(bookmarks="outline")
     def __init__(
         self, strict: bool = False, fileobj: Union[Path, StrByteType] = ""
     ) -> None:
+        deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
         self.inputs: List[Tuple[Any, PdfReader]] = []
         self.pages: List[Any] = []
         self.output: Optional[PdfWriter] = PdfWriter()
@@ -117,6 +108,7 @@ def __init__(
 
     def __enter__(self) -> "PdfMerger":
         # There is nothing to do.
+        deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
         return self
 
     def __exit__(
@@ -522,13 +514,13 @@ def _write_dests(self) -> None:
             raise RuntimeError(ERR_CLOSED_WRITER)
         for named_dest in self.named_dests:
             page_index = None
-            if "/Page" in named_dest:
+            if "/Page" in named_dest:  # deprecated
                 for page_index, page in enumerate(self.pages):  # noqa: B007
                     if page.id == named_dest["/Page"]:
                         named_dest[NameObject("/Page")] = page.out_pagedata
                         break
 
-            if page_index is not None:
+            if page_index is not None:  # deprecated
                 self.output.add_named_destination_object(named_dest)
 
     @deprecation_bookmark(bookmarks="outline")
@@ -606,7 +598,7 @@ def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None:
                 if np.get_object() == page.pagedata.get_object():
                     page_index = page.id
 
-            if page_index is None:
+            if page_index is None:  # deprecated
                 raise ValueError(
                     f"Unresolved named destination '{named_dest['/Title']}'"
                 )
@@ -651,7 +643,7 @@ def find_outline_item(
                 # oi_enum is still an inner node
                 # (OutlineType, if recursive types were supported by mypy)
                 res = self.find_outline_item(outline_item, oi_enum)  # type: ignore
-                if res:
+                if res:  # deprecated
                     return [i] + res
             elif (
                 oi_enum == outline_item

diff --git a/pypdf/_version.py b/pypdf/_version.py
@@ -1 +1 @@
-__version__ = "3.9.0"
+__version__ = "3.9.1"
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -940,9 +940,14 @@ def _flatten(
             pages = cast(DictionaryObject, self._root_object["/Pages"])
             self.flattened_pages = ArrayObject()
         assert pages is not None  # hint for mypy
-        t = "/Pages"
+
         if PA.TYPE in pages:
-            t = cast(str, pages[PA.TYPE])
+            t = str(pages[PA.TYPE])
+        # if pdf has no type, considered as a page if /Kids is missing
+        elif PA.KIDS not in pages:
+            t = "/Page"
+        else:
+            t = "/Pages"
 
         if t == "/Pages":
             for attr in inheritable_page_attributes:

diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
@@ -104,7 +104,7 @@ def read_string_from_stream(
                     # line break was escaped:
                     tok = b""
                 else:
-                    msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
+                    msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
                     logger_warning(msg, __name__)
         txt.append(tok)
     return create_string_object(b"".join(txt), forced_encoding)

diff --git a/pyproject.toml b/pyproject.toml
@@ -41,7 +41,7 @@ Changelog = "https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html"
 full = ["PyCryptodome", "Pillow"]
 crypto = ["PyCryptodome"]
 image = ["Pillow"]
-dev = ["black", "pip-tools", "pre-commit<2.18.0", "pytest-cov", "flit", "wheel"]
+dev = ["black", "pip-tools", "pre-commit<2.18.0", "pytest-cov", "pytest-socket", "flit", "wheel"]
 docs = ["sphinx", "sphinx_rtd_theme", "myst_parser"]
 
 [tool.mutmut]

diff --git a/tests/bench.py b/tests/bench.py
@@ -66,43 +66,42 @@ def merge():
     pdf_forms = RESOURCE_ROOT / "pdflatex-forms.pdf"
     pdf_pw = RESOURCE_ROOT / "libreoffice-writer-password.pdf"
 
-    merger = pypdf.PdfMerger()
+    writer = PdfWriter()
 
     # string path:
-    merger.append(pdf_path)
-    merger.append(outline)
-    merger.append(pdf_path, pages=pypdf.pagerange.PageRange(slice(0, 0)))
-    merger.append(pdf_forms)
+    writer.append(pdf_path)
+    writer.append(outline)
+    writer.append(pdf_path, pages=pypdf.pagerange.PageRange(slice(0, 0)))
+    writer.append(pdf_forms)
 
     # Merging an encrypted file
     reader = pypdf.PdfReader(pdf_pw)
     reader.decrypt("openpassword")
-    merger.append(reader)
+    writer.append(reader)
 
     # PdfReader object:
-    merger.append(pypdf.PdfReader(pdf_path, "rb"), outline_item=True)
+    writer.append(pypdf.PdfReader(pdf_path, "rb"), outline_item=True)
 
     # File handle
     with open(pdf_path, "rb") as fh:
-        merger.append(fh)
+        writer.append(fh)
 
-    outline_item = merger.add_outline_item("An outline item", 0)
-    merger.add_outline_item("deeper", 0, parent=outline_item)
-    merger.add_metadata({"author": "Martin Thoma"})
-    merger.add_named_destination("title", 0)
-    merger.set_page_layout("/SinglePage")
-    merger.set_page_mode("/UseThumbs")
+    outline_item = writer.add_outline_item("An outline item", 0)
+    writer.add_outline_item("deeper", 0, parent=outline_item)
+    writer.add_metadata({"author": "Martin Thoma"})
+    writer.add_named_destination("title", 0)
+    writer.set_page_layout("/SinglePage")
+    writer.set_page_mode("/UseThumbs")
 
     write_path = "dont_commit_merged.pdf"
-    merger.write(write_path)
-    merger.close()
+    writer.write(write_path)
+    writer.close()
 
     # Check if outline is correct
     reader = pypdf.PdfReader(write_path)
     assert [
         el.title for el in reader._get_outline() if isinstance(el, Destination)
     ] == [
-        "An outline item",
         "Foo",
         "Bar",
         "Baz",
@@ -113,6 +112,7 @@ def merge():
         "Bar",
         "Baz",
         "True",
+        "An outline item",
     ]
 
 

diff --git a/tests/test_encryption.py b/tests/test_encryption.py
@@ -168,6 +168,7 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password):
     ],
 )
 @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome")
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_merge_encrypted_pdfs(names):
     """Encrypted PDFs can be merged after decryption."""
     merger = pypdf.PdfMerger()

diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -678,6 +678,7 @@ def test_bool_repr(tmp_path):
 
 @pytest.mark.enable_socket()
 @patch("pypdf._reader.logger_warning")
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_issue_997(mock_logger_warning, pdf_file_path):
     url = (
         "https://github.com/py-pdf/pypdf/files/8908874/"