ENH: Compress PDF files merging identical objects (#2795)

Add compress_identical_objects(). Discovered in #2728. Closes #2794. Closes #2768.
py-pdf · Aug 13, 2024 · cf7fcfd · cf7fcfd
1 parent a9758ae
commit cf7fcfd
Show file tree

Hide file tree

Showing 5 changed files with 177 additions and 137 deletions.
diff --git a/docs/user/file-size.md b/docs/user/file-size.md
@@ -9,23 +9,17 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-This can be done by reading and writing the file:
+When adding data to a PdfWriter, the data is copied while respecting the original format.
+For example, if two pages use the same image and that image is stored twice in the source document, it will also be stored twice in the PdfWriter object.
 
-```python
-from pypdf import PdfReader, PdfWriter
-
-reader = PdfReader("big-old-file.pdf")
-writer = PdfWriter()
+Additionally, when you delete objects in a document, pypdf cannot easily determine whether those objects are still used elsewhere or whether the user wants to keep them. When the PDF file is written, these objects remain hidden inside it (they are part of the file, but are not displayed).
 
-for page in reader.pages:
-    writer.add_page(page)
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 
-if reader.metadata is not None:
-    writer.add_metadata(reader.metadata)
+* `remove_identicals` enables/disables the merging of identical objects.
+* `remove_orphans` enables/disables the removal of unused (orphaned) objects.
 
-with open("smaller-new-file.pdf", "wb") as fp:
-    writer.write(fp)
-```
+It is recommended to apply this process just before writing to the file/stream.
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -44,7 +44,7 @@ def __post_init__(self) -> None:
                 self.font_dictionary["/DescendantFonts"]
             ):
                 while isinstance(d_font, IndirectObject):
-                    d_font = d_font.get_object()  # type: ignore[assignment]
+                    d_font = d_font.get_object()
                 self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                 ord_map = {
                     ord(_target): _surrogate
@@ -75,20 +75,32 @@ def __post_init__(self) -> None:
                             {
                                 ord_map[_cidx]: _width
                                 for _cidx, _width in zip(
-                                    range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
+                                    range(
+                                        cast(int, start_idx),
+                                        cast(int, start_idx) + len(width_list),
+                                        1,
+                                    ),
                                     width_list,
                                 )
                                 if _cidx in ord_map
                             }
                         )
                         skip_count = 1
                     # check for format (2): `int int int`
-                    elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
-                        start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
+                    elif isinstance(w_next_entry, (int, float)) and isinstance(
+                        _w[idx + 2].get_object(), (int, float)
+                    ):
+                        start_idx, stop_idx, const_width = (
+                            w_entry,
+                            w_next_entry,
+                            _w[idx + 2].get_object(),
+                        )
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: const_width
-                                for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
+                                for _cidx in range(
+                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
+                                )
                                 if _cidx in ord_map
                             }
                         )