From 8d30c88a95113fcc37938ae45e8bb03ee5d76442 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:25:34 +0200 Subject: [PATCH 01/17] ENH: compress pdf files merging identical objects add compress_identical_objects() discovered in #2728 closes #2794 closes #2768 --- pypdf/_writer.py | 233 +++++++++++++++++++++-------------------- pypdf/generic/_base.py | 3 + 2 files changed, 122 insertions(+), 114 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 00b9d498c..93c474db2 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -27,11 +27,11 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import collections import decimal import enum import hashlib import re +import sys import uuid from io import BytesIO, FileIO, IOBase from pathlib import Path @@ -40,7 +40,6 @@ IO, Any, Callable, - Deque, Dict, Iterable, List, @@ -157,12 +156,17 @@ def __init__( clone_from: Union[None, PdfReader, StrByteType, Path] = None, ) -> None: self._header = b"%PDF-1.3" - self._objects: List[PdfObject] = [] + self._objects: List[Optional[PdfObject]] = [] """The indirect objects in the PDF.""" - self._idnum_hash: Dict[bytes, IndirectObject] = {} - """Maps hash values of indirect objects to their IndirectObject instances.""" + """Maps hash values of indirect objects to the list of IndirectObjects. + This is used for compression + """ + self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} + """list of translation already done. + dict[id(pdf)][(idnum, generation)] + """ self._id_translated: Dict[int, Dict[int, int]] = {} # The root of our page tree node. 
@@ -371,10 +375,13 @@ def get_object( indirect_reference: Union[int, IndirectObject], ) -> PdfObject: if isinstance(indirect_reference, int): - return self._objects[indirect_reference - 1] - if indirect_reference.pdf != self: + obj = self._objects[indirect_reference - 1] + elif indirect_reference.pdf != self: raise ValueError("pdf must be self") - return self._objects[indirect_reference.idnum - 1] + else: + obj = self._objects[indirect_reference.idnum - 1] + assert obj is not None + return obj def _replace_object( self, @@ -393,6 +400,7 @@ def _replace_object( obj = obj.clone(self) self._objects[indirect_reference - 1] = obj obj.indirect_reference = IndirectObject(indirect_reference, gen, self) + assert obj is None return self._objects[indirect_reference - 1] def _add_page( @@ -1246,10 +1254,10 @@ def write_stream(self, stream: StreamType) -> None: if not self._root: self._root = self._add_object(self._root_object) - self._sweep_indirect_references(self._root) + # no more used : self._sweep_indirect_references(self._root) - object_positions = self._write_pdf_structure(stream) - xref_location = self._write_xref_table(stream, object_positions) + object_positions, free_objects = self._write_pdf_structure(stream) + xref_location = self._write_xref_table(stream, object_positions, free_objects) self._write_trailer(stream, xref_location) def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: @@ -1282,8 +1290,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: return my_file, stream - def _write_pdf_structure(self, stream: StreamType) -> List[int]: + def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: object_positions = [] + free_objects = [] # will contain list of all free entries stream.write(self.pdf_header.encode() + b"\n") stream.write(b"%\xE2\xE3\xCF\xD3\n") @@ -1296,15 +1305,26 @@ def _write_pdf_structure(self, stream: StreamType) -> List[int]: obj = self._encryption.encrypt_object(obj, 
idnum, 0) obj.write_to_stream(stream) stream.write(b"\nendobj\n") - return object_positions - - def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int: + else: + object_positions.append(-1) + free_objects.append(i + 1) + free_objects.append(0) # add 0 to loop iaw PDF spec + return object_positions, free_objects + + def _write_xref_table( + self, stream: StreamType, object_positions: List[int], free_objects: List[int] + ) -> int: xref_location = stream.tell() stream.write(b"xref\n") stream.write(f"0 {len(self._objects) + 1}\n".encode()) - stream.write(f"{0:0>10} {65535:0>5} f \n".encode()) + stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode()) + free_idx = 1 for offset in object_positions: - stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) + if offset > 0: + stream.write(f"{offset:0>10} {0:0>5} n \n".encode()) + else: + stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode()) + free_idx += 1 return xref_location def _write_trailer(self, stream: StreamType, xref_location: int) -> None: @@ -1349,6 +1369,73 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: assert isinstance(self._info, DictionaryObject) self._info.update(args) + def compress_identical_objects(self, verbose: Union[int, bool] = -1) -> None: + """ + Parse the Pdf file and merge objects that have same harsh. + This will make objects common to multiple pages + Recommended to be used just before writing output + + Args: + verbose: provide some progress information. 
+ int : frequence of progress update; disable if negative + bool : True => 100 ; False = -1 + """ + + def replace_in_obj( + obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] + ) -> None: + if isinstance(obj, DictionaryObject): + key_val = obj.items() + elif isinstance(obj, ArrayObject): + key_val = enumerate(obj) # type: ignore + else: + return + assert isinstance(obj, (DictionaryObject, ArrayObject)) + for k, v in key_val: + if isinstance(v, IndirectObject) and v in crossref: + obj[k] = crossref[v] + else: # if isinstance(v, (DictionaryObject, ArrayObject)): + replace_in_obj(v, crossref) + + # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) + self._idnum_hash = {} + if isinstance(verbose, int): + cpt_init = verbose + else: + cpt_init = 100 if verbose else -1 + cpt = cpt_init + # look for similar objects + for idx, obj in enumerate(self._objects): + if obj is None: + continue + assert isinstance(obj.indirect_reference, IndirectObject) + h = obj.hash_value() + if cpt == 0: + print("+", end="", file=sys.stderr) # noqa: T201 + cpt = cpt_init + cpt -= 1 + if h in self._idnum_hash: + self._idnum_hash[h][1].append(obj.indirect_reference) + self._objects[idx] = None + else: + self._idnum_hash[h] = (obj.indirect_reference, []) + + # generate the dict converting others to 1st + cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0} + cnv_rev: Dict[IndirectObject, IndirectObject] = {} + for k, v in cnv.items(): + cnv_rev.update(zip(v, (k,) * len(v))) + cpt = cpt_init + + # replace reference to merged objects + for obj in self._objects: + if isinstance(obj, (DictionaryObject, ArrayObject)): + if cpt == 0: + print(".", end="", file=sys.stderr) # noqa: T201 + cpt = cpt_init + cpt -= 1 + replace_in_obj(obj, cnv_rev) + def _sweep_indirect_references( self, root: Union[ @@ -1363,7 +1450,7 @@ def _sweep_indirect_references( TextStringObject, NullObject, ], - ) -> None: + ) -> None: # deprecated """ Resolving any circular references to Page 
objects. @@ -1379,73 +1466,15 @@ def _sweep_indirect_references( Args: root: The root of the PDF object tree to sweep. """ - stack: Deque[ - Tuple[ - Any, - Optional[Any], - Any, - List[PdfObject], - ] - ] = collections.deque() - discovered = [] - parent = None - grant_parents: List[PdfObject] = [] - key_or_id = None - - # Start from root - stack.append((root, parent, key_or_id, grant_parents)) - - while len(stack): - data, parent, key_or_id, grant_parents = stack.pop() - - # Build stack for a processing depth-first - if isinstance(data, (ArrayObject, DictionaryObject)): - for key, value in data.items(): - stack.append( - ( - value, - data, - key, - grant_parents + [parent] if parent is not None else [], - ) - ) - elif isinstance(data, IndirectObject) and data.pdf != self: - data = self._resolve_indirect_object(data) - - if str(data) not in discovered: - discovered.append(str(data)) - stack.append((data.get_object(), None, None, [])) - - # Check if data has a parent and if it is a dict or - # an array update the value - if isinstance(parent, (DictionaryObject, ArrayObject)): - if isinstance(data, StreamObject): - # a dictionary value is a stream; streams must be indirect - # objects, so we need to change this value. 
- data = self._resolve_indirect_object(self._add_object(data)) - - update_hashes = [] - - # Data changed and thus the hash value changed - if parent[key_or_id] != data: - update_hashes = [parent.hash_value()] + [ - grant_parent.hash_value() for grant_parent in grant_parents - ] - parent[key_or_id] = data - - # Update old hash value to new hash value - for old_hash in update_hashes: - indirect_reference = self._idnum_hash.pop(old_hash, None) - - if indirect_reference is not None: - indirect_reference_obj = indirect_reference.get_object() - - if indirect_reference_obj is not None: - self._idnum_hash[ - indirect_reference_obj.hash_value() - ] = indirect_reference + deprecate_with_replacement( + "_sweep_indirect_references", + "no replacement, please report to dev team if this warning is observed", + "5.0.0", + ) - def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: + def _resolve_indirect_object( + self, data: IndirectObject + ) -> IndirectObject: # deprecated """ Resolves an indirect object to an indirect object in this PDF file. @@ -1470,36 +1499,12 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: Raises: ValueError: If the input stream is closed. 
""" - if hasattr(data.pdf, "stream") and data.pdf.stream.closed: - raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}") - - if data.pdf == self: - return data - - # Get real object indirect object - real_obj = data.pdf.get_object(data) - - if real_obj is None: - logger_warning( - f"Unable to resolve [{data.__class__.__name__}: {data}], " - "returning NullObject instead", - __name__, - ) - real_obj = NullObject() - - hash_value = real_obj.hash_value() - - # Check if object is handled - if hash_value in self._idnum_hash: - return self._idnum_hash[hash_value] - - if data.pdf == self: - self._idnum_hash[hash_value] = IndirectObject(data.idnum, 0, self) - # This is new object in this pdf - else: - self._idnum_hash[hash_value] = self._add_object(real_obj) - - return self._idnum_hash[hash_value] + deprecate_with_replacement( + "_resolve_indirect_object", + "no replacement, please report to dev team if this warning is observed", + "5.0.0", + ) + return IndirectObject(0, 0, self) def get_reference(self, obj: PdfObject) -> IndirectObject: idnum = self._objects.index(obj) + 1 diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 2d606b418..35ce956cc 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -240,6 +240,9 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader self.generation = generation self.pdf = pdf + def __hash__(self) -> int: + return hash((self.idnum, self.generation, id(self.pdf))) + def clone( self, pdf_dest: PdfWriterProtocol, From 86fd7c713fca5b676098504c565dd09965e9463a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:28:08 +0200 Subject: [PATCH 02/17] pre-commit error reported --- pypdf/_text_extraction/_layout_mode/_font.py | 22 +++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index 
40655b1b2..1d9617d74 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -44,7 +44,7 @@ def __post_init__(self) -> None: self.font_dictionary["/DescendantFonts"] ): while isinstance(d_font, IndirectObject): - d_font = d_font.get_object() # type: ignore[assignment] + d_font = d_font.get_object() self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font ord_map = { ord(_target): _surrogate @@ -75,7 +75,11 @@ def __post_init__(self) -> None: { ord_map[_cidx]: _width for _cidx, _width in zip( - range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), + range( + cast(int, start_idx), + cast(int, start_idx) + len(width_list), + 1, + ), width_list, ) if _cidx in ord_map @@ -83,12 +87,20 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): - start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() + elif isinstance(w_next_entry, (int, float)) and isinstance( + _w[idx + 2].get_object(), (int, float) + ): + start_idx, stop_idx, const_width = ( + w_entry, + w_next_entry, + _w[idx + 2].get_object(), + ) self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) + for _cidx in range( + cast(int, start_idx), cast(int, stop_idx + 1), 1 + ) if _cidx in ord_map } ) From f91f131c581bc1015232ae68931495249738d747 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 11 Aug 2024 20:46:22 +0200 Subject: [PATCH 03/17] oups --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 93c474db2..7c4cbbd96 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -400,8 +400,8 @@ def _replace_object( obj = obj.clone(self) self._objects[indirect_reference - 1] = obj 
obj.indirect_reference = IndirectObject(indirect_reference, gen, self) - assert obj is None - return self._objects[indirect_reference - 1] + assert isinstance(obj, IndirectObject) + return obj def _add_page( self, From db67c0b6eb8a39f975de2decbc9a205c36bb0185 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:39:43 +0200 Subject: [PATCH 04/17] doc --- docs/user/file-size.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 0ee72e37e..4908bda08 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -9,23 +9,24 @@ Some PDF documents contain the same object multiple times. For example, if an image appears three times in a PDF it could be embedded three times. Or it can be embedded once and referenced twice. -This can be done by reading and writing the file: +When adding data to a PdfWriter, the data are copied respecting the original format. +For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object -```python -from pypdf import PdfReader, PdfWriter +Also when you delete objects in a document, pypdf can not easily identify weither the object is used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within. 
-reader = PdfReader("big-old-file.pdf") -writer = PdfWriter() +in order to reduce the file size a compression process: +`writer.compress_identical_objects(remove_identical: byte = True, remove_orphans:byte = True, verbose: int = -1)` -for page in reader.pages: - writer.add_page(page) +`remove_identical` enables / disables compression merging identical objects +`remove_orphans` enables / disables suppression of unused objects +`verbose` sets the value on how many objects are processed +the progress status (printed on stderr) of the compression is printed as follow: + '+' during initial loop + '.' when removing duplicates + '\*' when removing hidden objects -if reader.metadata is not None: - writer.add_metadata(reader.metadata) +It is recommended to apply this process just before writing to file/stream -with open("smaller-new-file.pdf", "wb") as fp: - writer.write(fp) -``` It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF. From 5bad76c2f2bc0957c95f83cd215df55a2c41d517 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 11 Aug 2024 22:38:46 +0200 Subject: [PATCH 05/17] WIP on iss2794 --- pypdf/_writer.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 7c4cbbd96..4d1e53401 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -62,6 +62,7 @@ StreamType, _get_max_pdf_version_header, b_, + deprecate, deprecate_with_replacement, logger_warning, ) @@ -400,7 +401,8 @@ def _replace_object( obj = obj.clone(self) self._objects[indirect_reference - 1] = obj obj.indirect_reference = IndirectObject(indirect_reference, gen, self) - assert isinstance(obj, IndirectObject) + + assert isinstance(obj, PdfObject) return obj def _add_page( @@ -1466,10 +1468,8 @@ def _sweep_indirect_references( Args: root: The root of the PDF object tree to sweep. 
""" - deprecate_with_replacement( - "_sweep_indirect_references", - "no replacement, please report to dev team if this warning is observed", - "5.0.0", + deprecate( + "_sweep_indirect_references has been removed, please report to dev team if this warning is observed", ) def _resolve_indirect_object( @@ -1499,10 +1499,8 @@ def _resolve_indirect_object( Raises: ValueError: If the input stream is closed. """ - deprecate_with_replacement( - "_resolve_indirect_object", - "no replacement, please report to dev team if this warning is observed", - "5.0.0", + deprecate( + "_resolve_indirect_object has been removed, please report to dev team if this warning is observed", ) return IndirectObject(0, 0, self) From bd4b67297c8f18d3d15016ca4caf158c8fadfed1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 00:48:56 +0200 Subject: [PATCH 06/17] add remove_orphans + test --- pypdf/_writer.py | 50 ++++++++++++++++++++++++++++++++------------ tests/test_writer.py | 23 ++++++++++++++++++++ 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 4d1e53401..4bd71482a 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -34,6 +34,7 @@ import sys import uuid from io import BytesIO, FileIO, IOBase +from itertools import compress from pathlib import Path from types import TracebackType from typing import ( @@ -1252,11 +1253,10 @@ def write_stream(self, stream: StreamType) -> None: "It may not be written to correctly.", __name__, ) - - if not self._root: - self._root = self._add_object(self._root_object) - - # no more used : self._sweep_indirect_references(self._root) + # no more used : + # if not self._root: + # self._root = self._add_object(self._root_object) + # self._sweep_indirect_references(self._root) object_positions, free_objects = self._write_pdf_structure(stream) xref_location = self._write_xref_table(stream, object_positions, free_objects) @@ -1371,16 +1371,22 @@ def 
add_metadata(self, infos: Dict[str, Any]) -> None: assert isinstance(self._info, DictionaryObject) self._info.update(args) - def compress_identical_objects(self, verbose: Union[int, bool] = -1) -> None: + def compress_identical_objects( + self, + remove_identicals: bool = True, + remove_orphans: bool = True, + verbose: int = -1, + ) -> None: """ Parse the Pdf file and merge objects that have same harsh. This will make objects common to multiple pages Recommended to be used just before writing output Args: - verbose: provide some progress information. - int : frequence of progress update; disable if negative - bool : True => 100 ; False = -1 + remove_identicals: remove of identical objects + remove_orphans: remove of unreferenced objects + verbose: frequence of progress update; <0 => disable + """ def replace_in_obj( @@ -1394,9 +1400,13 @@ def replace_in_obj( return assert isinstance(obj, (DictionaryObject, ArrayObject)) for k, v in key_val: - if isinstance(v, IndirectObject) and v in crossref: - obj[k] = crossref[v] - else: # if isinstance(v, (DictionaryObject, ArrayObject)): + if isinstance(v, IndirectObject): + orphans[v.idnum - 1] = False + if v in crossref: + obj[k] = crossref[v] + else: + """the filtering on DictionaryObject and ArrayObject only + will be performed within replace_in_obj""" replace_in_obj(v, crossref) # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) @@ -1406,6 +1416,7 @@ def replace_in_obj( else: cpt_init = 100 if verbose else -1 cpt = cpt_init + orphans = [True] * len(self._objects) # look for similar objects for idx, obj in enumerate(self._objects): if obj is None: @@ -1416,7 +1427,7 @@ def replace_in_obj( print("+", end="", file=sys.stderr) # noqa: T201 cpt = cpt_init cpt -= 1 - if h in self._idnum_hash: + if remove_identicals and h in self._idnum_hash: self._idnum_hash[h][1].append(obj.indirect_reference) self._objects[idx] = None else: @@ -1438,6 +1449,19 @@ def replace_in_obj( cpt -= 1 replace_in_obj(obj, cnv_rev) + # remove 
orphans (if applicable) + orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore + try: + orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore + except Exception: + pass + try: + orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore + except Exception: + pass + for i in compress(range(len(self._objects)), orphans): + self._objects[i] = None + def _sweep_indirect_references( self, root: Union[ diff --git a/tests/test_writer.py b/tests/test_writer.py index 9dfeffdd8..c06fede08 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2300,3 +2300,26 @@ def test_matrix_entry_in_field_annots(): auto_regenerate=False, ) assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"] + + +@pytest.mark.enable_socket() +def test_compress_identical_objects(): + """Cf #2728 and #2794""" + url = "https://github.com/user-attachments/files/16575458/tt2.pdf" + name = "iss2794.pdf" + in_bytes = BytesIO(get_data_from_url(url, name=name)) + writer = PdfWriter(in_bytes) + writer.compress_identical_objects(remove_orphans=False, verbose=100) + out1 = BytesIO() + writer.write(out1) + assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) + writer.remove_page( + 1 + ) # page0 contains fields which keep reference to the deleted page + out2 = BytesIO() + writer.write(out2) + assert len(out1.getvalue()) - 100 < len(out2.getvalue()) + writer.compress_identical_objects(remove_identicals=False, verbose=100) + out3 = BytesIO() + writer.write(out3) + assert len(out2.getvalue()) > len(out3.getvalue()) From 380233bfac83ad565e72756a889007c96cc37b7d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 01:41:27 +0200 Subject: [PATCH 07/17] doc + coverage --- docs/user/file-size.md | 11 ++++++----- pypdf/_writer.py | 6 +++--- tests/test_writer.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md 
index 4908bda08..96ce6fab5 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -12,18 +12,19 @@ be embedded once and referenced twice. When adding data to a PdfWriter, the data are copied respecting the original format. For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object -Also when you delete objects in a document, pypdf can not easily identify weither the object is used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within. +Also when you delete objects in a document, pypdf can not easily identify weither the objects are used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within. in order to reduce the file size a compression process: -`writer.compress_identical_objects(remove_identical: byte = True, remove_orphans:byte = True, verbose: int = -1)` +`writer.compress_identical_objects(remove_identicals = True, remove_orphans= True, verbose = -1)` `remove_identical` enables / disables compression merging identical objects + `remove_orphans` enables / disables suppression of unused objects + `verbose` sets the value on how many objects are processed the progress status (printed on stderr) of the compression is printed as follow: - '+' during initial loop - '.' when removing duplicates - '\*' when removing hidden objects +* '+' during initial loop +* '.' 
when replacing duplicates It is recommended to apply this process just before writing to file/stream diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 4bd71482a..65a1d96df 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1414,7 +1414,7 @@ def replace_in_obj( if isinstance(verbose, int): cpt_init = verbose else: - cpt_init = 100 if verbose else -1 + cpt_init = -1 cpt = cpt_init orphans = [True] * len(self._objects) # look for similar objects @@ -1453,8 +1453,8 @@ def replace_in_obj( orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore try: orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore - except Exception: - pass + except Exception: # pragma: no cover + pass # pragma: no cover try: orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore except Exception: diff --git a/tests/test_writer.py b/tests/test_writer.py index c06fede08..9ecaa25ca 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2319,7 +2319,7 @@ def test_compress_identical_objects(): out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) - writer.compress_identical_objects(remove_identicals=False, verbose=100) + writer.compress_identical_objects(remove_identicals=False, verbose="fake") out3 = BytesIO() writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) From 33c0d47748a49a9f22f8a7038b6c73d104cf6068 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:05:59 +0200 Subject: [PATCH 08/17] improve wording --- docs/user/file-size.md | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 96ce6fab5..8a103d96a 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -9,25 +9,23 @@ Some PDF documents contain the same object multiple times. 
For example, if an image appears three times in a PDF it could be embedded three times. Or it can be embedded once and referenced twice. -When adding data to a PdfWriter, the data are copied respecting the original format. -For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object +When adding data to a PdfWriter, the data is copied while respecting the original format. +For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object. -Also when you delete objects in a document, pypdf can not easily identify weither the objects are used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within. +Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed). -in order to reduce the file size a compression process: -`writer.compress_identical_objects(remove_identicals = True, remove_orphans= True, verbose = -1)` +In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)` -`remove_identical` enables / disables compression merging identical objects +* `remove_identicals` enables/disables compression merging identical objects. +* `remove_orphans` enables/disables suppression of unused objects. +* `verbose` sets the value on how many objects are processed. 
-`remove_orphans` enables / disables suppression of unused objects +The progress status (printed on stderr) of the compression is printed as follows: -`verbose` sets the value on how many objects are processed -the progress status (printed on stderr) of the compression is printed as follow: -* '+' during initial loop -* '.' when replacing duplicates - -It is recommended to apply this process just before writing to file/stream +* `'+'` during initial loop +* `'.'` when replacing duplicates +It is recommended to apply this process just before writing to file/stream. It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF. From 661ae6403d0671a7b96580a9d158dd50f8461df6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:47:12 +0200 Subject: [PATCH 09/17] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 65a1d96df..66b91af07 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1310,7 +1310,7 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int] else: object_positions.append(-1) free_objects.append(i + 1) - free_objects.append(0) # add 0 to loop iaw PDF spec + free_objects.append(0) # add 0 to loop in accordance with PDF spec return object_positions, free_objects def _write_xref_table( From e51ba8ca5526d49c7cb54413af83d6c637c3be66 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 19:54:08 +0200 Subject: [PATCH 10/17] from review --- pypdf/_writer.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 66b91af07..d1a33bc81 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -382,7 +382,7 @@ 
def get_object( raise ValueError("pdf must be self") else: obj = self._objects[indirect_reference.idnum - 1] - assert obj is not None + assert obj is not None # clarification for mypy return obj def _replace_object( @@ -403,7 +403,7 @@ def _replace_object( self._objects[indirect_reference - 1] = obj obj.indirect_reference = IndirectObject(indirect_reference, gen, self) - assert isinstance(obj, PdfObject) + assert isinstance(obj, PdfObject) # clarification for mypy return obj def _add_page( @@ -1253,7 +1253,7 @@ def write_stream(self, stream: StreamType) -> None: "It may not be written to correctly.", __name__, ) - # no more used : + # deprecated; to be removed in pypdf 6.0.0: # if not self._root: # self._root = self._add_object(self._root_object) # self._sweep_indirect_references(self._root) @@ -1375,18 +1375,15 @@ def compress_identical_objects( self, remove_identicals: bool = True, remove_orphans: bool = True, - verbose: int = -1, ) -> None: """ - Parse the Pdf file and merge objects that have same harsh. + Parse the PDF file and merge objects that have same hash.
This will make objects common to multiple pages Recommended to be used just before writing output Args: remove_identicals: remove of identical objects remove_orphans: remove of unreferenced objects - verbose: frequence of progress update; <0 => disable - """ def replace_in_obj( @@ -1411,11 +1408,6 @@ def replace_in_obj( # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) self._idnum_hash = {} - if isinstance(verbose, int): - cpt_init = verbose - else: - cpt_init = -1 - cpt = cpt_init orphans = [True] * len(self._objects) # look for similar objects for idx, obj in enumerate(self._objects): @@ -1423,10 +1415,6 @@ def replace_in_obj( continue assert isinstance(obj.indirect_reference, IndirectObject) h = obj.hash_value() - if cpt == 0: - print("+", end="", file=sys.stderr) # noqa: T201 - cpt = cpt_init - cpt -= 1 if remove_identicals and h in self._idnum_hash: self._idnum_hash[h][1].append(obj.indirect_reference) self._objects[idx] = None @@ -1443,21 +1431,17 @@ def replace_in_obj( # replace reference to merged objects for obj in self._objects: if isinstance(obj, (DictionaryObject, ArrayObject)): - if cpt == 0: - print(".", end="", file=sys.stderr) # noqa: T201 - cpt = cpt_init - cpt -= 1 replace_in_obj(obj, cnv_rev) # remove orphans (if applicable) orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore try: orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore - except Exception: # pragma: no cover - pass # pragma: no cover + except AttributeError: + pass try: orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore - except Exception: + except AttributeError: pass for i in compress(range(len(self._objects)), orphans): self._objects[i] = None From 8d33a6989e055b6309269c1b379631975c698d64 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 19:57:04 +0200 Subject: [PATCH 11/17] aftermath from review --- tests/test_writer.py | 5 +++-- 1 file changed, 
3 insertions(+), 2 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 9ecaa25ca..5be9b091f 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2309,17 +2309,18 @@ def test_compress_identical_objects(): name = "iss2794.pdf" in_bytes = BytesIO(get_data_from_url(url, name=name)) writer = PdfWriter(in_bytes) - writer.compress_identical_objects(remove_orphans=False, verbose=100) + writer.compress_identical_objects(remove_orphans=False) out1 = BytesIO() writer.write(out1) assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) writer.remove_page( 1 ) # page0 contains fields which keep reference to the deleted page + writer._info = None out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) - writer.compress_identical_objects(remove_identicals=False, verbose="fake") + writer.compress_identical_objects(remove_identicals=False) out3 = BytesIO() writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) From ab2f2aeb184e63ec642cc08c7e82830429a7cf08 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:24:11 +0200 Subject: [PATCH 12/17] fix --- pypdf/_writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 760cdbbb3..63c96a50f 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -31,7 +31,6 @@ import enum import hashlib import re -import sys import uuid from io import BytesIO, FileIO, IOBase from itertools import compress @@ -62,6 +61,7 @@ StrByteType, StreamType, _get_max_pdf_version_header, + deprecate, deprecate_with_replacement, logger_warning, ) @@ -1425,7 +1425,6 @@ def replace_in_obj( cnv_rev: Dict[IndirectObject, IndirectObject] = {} for k, v in cnv.items(): cnv_rev.update(zip(v, (k,) * len(v))) - cpt = cpt_init # replace reference to merged objects for obj in self._objects: From 892ffd33ee4b3f3236be8e2ccc497b222e261d93 Mon Sep 17 00:00:00 2001 From: 
pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:10:40 +0200 Subject: [PATCH 13/17] fix --- pypdf/_writer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 63c96a50f..d53fd51dd 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1433,10 +1433,9 @@ def replace_in_obj( # remove orphans (if applicable) orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore - try: - orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore - except AttributeError: - pass + + orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore + try: orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore except AttributeError: From 8a6f3bd25531fa166bc60517ce11a07d12f9a8b8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:26:26 +0200 Subject: [PATCH 14/17] Update file-size.md --- docs/user/file-size.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 8a103d96a..26a5aacf8 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -14,16 +14,10 @@ For example, if two pages include the same image which is duplicated in the sour Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed). -In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)` +In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)` * `remove_identicals` enables/disables compression merging identical objects. 
* `remove_orphans` enables/disables suppression of unused objects. -* `verbose` sets the value on how many objects are processed. - -The progress status (printed on stderr) of the compression is printed as follows: - -* `'+'` during initial loop -* `'.'` when replacing duplicates It is recommended to apply this process just before writing to file/stream. From 361a291d68f5cb500158dd714ab9d107809059ef Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:35:03 +0200 Subject: [PATCH 15/17] Update test_writer.py --- tests/test_writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index c2b283c65..49fe58538 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2320,7 +2320,6 @@ def test_compress_identical_objects(): writer.remove_page( 1 ) # page0 contains fields which keep reference to the deleted page - writer._info = None out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) @@ -2329,7 +2328,7 @@ def test_compress_identical_objects(): writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) - + def test_set_need_appearances_writer(): """Minimal test for coverage""" writer = PdfWriter() From 185bddc3643b7d577b1b250ef74318146a8d73be Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:56:20 +0200 Subject: [PATCH 16/17] Update docs/user/file-size.md --- docs/user/file-size.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 26a5aacf8..d47ddcc0e 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -19,7 +19,7 @@ In order to reduce the file size, use a compression call: `writer.compress_ident * `remove_identicals` enables/disables compression merging identical objects. * `remove_orphans` enables/disables suppression of unused objects. 
-It is recommended to apply this process just before writing to file/stream. +It is recommended to apply this process just before writing to the file/stream. It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF. From 6b5e305da02efa05ae8abe6b386a61a40249f67f Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:00:12 +0200 Subject: [PATCH 17/17] improve docs --- pypdf/_writer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index d53fd51dd..a72e2a23d 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -160,11 +160,11 @@ def __init__( """The indirect objects in the PDF.""" """Maps hash values of indirect objects to the list of IndirectObjects. - This is used for compression + This is used for compression. """ self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} - """list of translation already done. + """List of already translated IDs. dict[id(pdf)][(idnum, generation)] """ self._id_translated: Dict[int, Dict[int, int]] = {} @@ -1377,12 +1377,12 @@ def compress_identical_objects( ) -> None: """ Parse the PDF file and merge objects that have same hash. - This will make objects common to multiple pages - Recommended to be used just before writing output + This will make objects common to multiple pages. + Recommended to be used just before writing output. Args: - remove_identicals: remove of identical objects - remove_orphans: remove of unreferenced objects + remove_identicals: Remove identical objects. + remove_orphans: Remove unreferenced objects. """ def replace_in_obj(