
ENH: compress pdf files merging identical objects #2795

Merged: 19 commits, Aug 13, 2024
26 changes: 14 additions & 12 deletions docs/user/file-size.md
@@ -9,23 +9,25 @@

Some PDF documents contain the same object multiple times. For example, if an
image appears three times in a PDF it could be embedded three times. Or it can
be embedded once and referenced twice.

When adding data to a `PdfWriter`, the data is copied while respecting the
original format. For example, if two pages in the source document include the
same image, and that image is duplicated in the source, the object will be
duplicated in the `PdfWriter` object as well.

Also, when you delete objects in a document, pypdf cannot easily identify
whether those objects are used elsewhere or whether the user wants to keep
them. When writing the PDF file, such objects remain hidden within it (part of
the file but not displayed).

In order to reduce the file size, apply a compression process:

`writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)`

`remove_identicals` enables/disables compression by merging identical objects.

`remove_orphans` enables/disables suppression of unused objects.

`verbose` sets after how many processed objects a progress marker is printed;
the progress status (printed on stderr) is shown as follows:
* `+` during the initial loop
* `.` when replacing duplicates

It is recommended to apply this process just before writing to the file/stream:

```python
from pypdf import PdfReader, PdfWriter

reader = PdfReader("big-old-file.pdf")
writer = PdfWriter()

for page in reader.pages:
    writer.add_page(page)

if reader.metadata is not None:
    writer.add_metadata(reader.metadata)

# Merge identical objects and drop unused ones just before writing.
writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)

with open("smaller-new-file.pdf", "wb") as fp:
    writer.write(fp)
```

It depends on the PDF how well this works, but we have seen an 86% file
reduction (from 5.7 MB to 0.8 MB) within a real PDF.
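The merging idea behind `compress_identical_objects` can be sketched independently of pypdf: serialize each object, hash the bytes, and redirect every duplicate to the first occurrence. The helper below is an illustrative sketch only, not pypdf's implementation; the name `dedup_objects` and the bytes-keyed object map are assumptions for the example (pypdf operates on parsed PDF objects, not raw bytes).

```python
import hashlib


def dedup_objects(objects):
    """Merge byte-identical objects.

    `objects` maps object ids to serialized bytes. Returns (canonical, unique):
    `canonical` maps every object id to the id it should reference after
    merging; `unique` keeps only one copy of each distinct byte string.
    Illustrative sketch only, not pypdf's actual algorithm.
    """
    seen = {}       # digest -> id of the first object with these bytes
    canonical = {}  # object id -> canonical object id
    unique = {}
    for obj_id, data in objects.items():
        digest = hashlib.sha256(data).hexdigest()
        if digest in seen:
            canonical[obj_id] = seen[digest]  # duplicate: reuse first copy
        else:
            seen[digest] = obj_id
            canonical[obj_id] = obj_id
            unique[obj_id] = data
    return canonical, unique


objs = {1: b"<</Type/XObject>>", 2: b"<</Type/Page>>", 3: b"<</Type/XObject>>"}
canonical, unique = dedup_objects(objs)
# object 3 is byte-identical to object 1, so it maps to 1
```

After such a pass, references to merged objects are rewritten via `canonical`, and only `unique` objects are emitted, which is what shrinks the file.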
22 changes: 17 additions & 5 deletions pypdf/_text_extraction/_layout_mode/_font.py
@@ -44,7 +44,7 @@ def __post_init__(self) -> None:
self.font_dictionary["/DescendantFonts"]
):
while isinstance(d_font, IndirectObject):
d_font = d_font.get_object() # type: ignore[assignment]
d_font = d_font.get_object()
self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
ord_map = {
ord(_target): _surrogate
@@ -75,20 +75,32 @@ def __post_init__(self) -> None:
{
ord_map[_cidx]: _width
for _cidx, _width in zip(
range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
range(
cast(int, start_idx),
cast(int, start_idx) + len(width_list),
1,
),
width_list,
)
if _cidx in ord_map
}
)
skip_count = 1
# check for format (2): `int int int`
elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
elif isinstance(w_next_entry, (int, float)) and isinstance(
_w[idx + 2].get_object(), (int, float)
):
start_idx, stop_idx, const_width = (
w_entry,
w_next_entry,
_w[idx + 2].get_object(),
)
self.width_map.update(
{
ord_map[_cidx]: const_width
for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
for _cidx in range(
cast(int, start_idx), cast(int, stop_idx + 1), 1
)
if _cidx in ord_map
}
)
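The `_font.py` hunk above reflows the parsing of a CIDFont `/W` array, which the PDF specification allows in two formats: `c [w1 w2 ...]` assigns consecutive widths starting at code `c`, and `c_first c_last w` assigns one constant width to a whole code range. A standalone sketch of that parsing logic (the function name `parse_w_array` is hypothetical, and pypdf's `ord_map`/`cast` indirection is omitted):

```python
def parse_w_array(w):
    """Parse a CIDFont /W array into a {cid: width} map.

    Handles the two formats from the PDF specification:
      format (1): c [w1 w2 ...]   -- widths for codes c, c+1, ...
      format (2): c_first c_last w -- one constant width for a code range
    Sketch only; pypdf additionally filters cids through its ord_map.
    """
    width_map = {}
    idx = 0
    while idx < len(w):
        start = w[idx]
        nxt = w[idx + 1]
        if isinstance(nxt, list):
            # format (1): a list of widths for consecutive codes
            for offset, width in enumerate(nxt):
                width_map[start + offset] = width
            idx += 2
        else:
            # format (2): start, stop (inclusive), constant width
            stop, width = nxt, w[idx + 2]
            for cid in range(start, stop + 1):
                width_map[cid] = width
            idx += 3
    return width_map


# e.g. parse_w_array([0, [600, 500], 10, 12, 700]) assigns 600/500 to
# codes 0/1 and the constant width 700 to codes 10 through 12
```

The `isinstance(..., (int, float))` checks in the diff serve the same purpose as the `isinstance(nxt, list)` branch here: distinguishing format (2) from format (1) by the type of the entry after the start code.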