From dcd15aaf50faa16a0b361d834e10ebe7c56dfc97 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 25 Sep 2024 11:03:41 +0200 Subject: [PATCH] ENH: Add `full` parameter to PdfWriter constructor (#2865) Allows loading huge files. Closes #2839. --- pypdf/_writer.py | 17 ++++-- pypdf/generic/_base.py | 22 ++++++++ pypdf/generic/_data_structures.py | 86 ++++++++++++++++++++++++++++++- tests/test_generic.py | 40 ++++++++++++++ tests/test_writer.py | 5 +- 5 files changed, 163 insertions(+), 7 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 67fbf67de..5852e13cf 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -156,8 +156,12 @@ class PdfWriter(PdfDocCommon): incremental: If true, loads the document and set the PdfWriter in incremental mode. + When writing incrementally, the original document is written first and new/modified content is appended. To be used for signed document/forms to keep signature valid. + + full: If true, loads all the objects (always full if incremental = True). + This parameter may allow loading very big PDFs. """ def __init__( @@ -165,8 +169,9 @@ def __init__( fileobj: Union[None, PdfReader, StrByteType, Path] = "", clone_from: Union[None, PdfReader, StrByteType, Path] = None, incremental: bool = False, + full: bool = False, ) -> None: - self.incremental = incremental + self.incremental = incremental or full """ Returns if the PdfWriter object has been started in incremental mode. 
""" @@ -203,7 +208,7 @@ def __init__( fileobj = BytesIO(f.read(-1)) if isinstance(fileobj, BytesIO): fileobj = PdfReader(fileobj) - else: + if not isinstance(fileobj, PdfReader): raise PyPdfError("Invalid type for incremental mode") self._reader = fileobj # prev content is in _reader.stream self._header = fileobj.pdf_header.encode() @@ -273,6 +278,8 @@ def _get_clone_from( } ) self._add_object(self._root_object) + if full and not incremental: + self.incremental = False if isinstance(self._ID, list): if isinstance(self._ID[0], TextStringObject): self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes()) @@ -1177,11 +1184,15 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: Args: reader: PdfReader from which the document root should be copied. """ + self._info_obj = None if self.incremental: self._objects = [None] * cast(int, reader.trailer["/Size"]) + for i in range(len(self._objects) - 1): + o = reader.get_object(i + 1) + if o is not None: + self._objects[i] = o.replicate(self) else: self._objects.clear() - self._info_obj = None self._root_object = reader.root_object.clone(self) self._pages = self._root_object.raw_get("/Pages") diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index e05e00b39..77caa4736 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -82,6 +82,22 @@ def hash_value(self) -> bytes: ) ).encode() + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "PdfObject": + """ + Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter) + without ensuring links. This is used in clone_document_from_root with incremental = True. + + Args: + pdf_dest: Target to clone to. 
+ + Returns: + The cloned PdfObject + """ + return self.clone(pdf_dest) + def clone( self, pdf_dest: PdfWriterProtocol, @@ -298,6 +314,12 @@ def hash_bin(self) -> int: """ return hash((self.__class__, self.idnum, self.generation, id(self.pdf))) + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "PdfObject": + return IndirectObject(self.idnum, self.generation, pdf_dest) + def clone( self, pdf_dest: PdfWriterProtocol, diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 17f5fae27..aba8326a1 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -102,6 +102,21 @@ class ArrayObject(List[Any], PdfObject): + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "ArrayObject": + arr = cast( + "ArrayObject", + self._reference_clone(ArrayObject(), pdf_dest, False), + ) + for data in self: + if hasattr(data, "replicate"): + arr.append(data.replicate(pdf_dest)) + else: + arr.append(data) + return arr + def clone( self, pdf_dest: PdfWriterProtocol, @@ -248,6 +263,20 @@ def read_from_stream( class DictionaryObject(Dict[Any, Any], PdfObject): + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "DictionaryObject": + d__ = cast( + "DictionaryObject", + self._reference_clone(self.__class__(), pdf_dest, False), + ) + for k, v in self.items(): + d__[k.replicate(pdf_dest)] = ( + v.replicate(pdf_dest) if hasattr(v, "replicate") else v + ) + return d__ + def clone( self, pdf_dest: PdfWriterProtocol, @@ -864,6 +893,31 @@ def __init__(self) -> None: self._data: bytes = b"" self.decoded_self: Optional[DecodedStreamObject] = None + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "StreamObject": + d__ = cast( + "StreamObject", + self._reference_clone(self.__class__(), pdf_dest, False), + ) + d__._data = self._data + try: + decoded_self = self.decoded_self + if decoded_self is None: + d__.decoded_self = None # update the replica, never the source object + else: + d__.decoded_self = cast( + "DecodedStreamObject", 
decoded_self.replicate(pdf_dest) + ) + except Exception: + pass + for k, v in self.items(): + d__[k.replicate(pdf_dest)] = ( + v.replicate(pdf_dest) if hasattr(v, "replicate") else v + ) + return d__ + def _clone( self, src: DictionaryObject, @@ -1105,7 +1159,37 @@ def __init__( stream_data = stream.get_data() assert stream_data is not None super().set_data(stream_data) - self.forced_encoding = forced_encoding + self.forced_encoding = forced_encoding + + def replicate( + self, + pdf_dest: PdfWriterProtocol, + ) -> "ContentStream": + d__ = cast( + "ContentStream", + self._reference_clone(self.__class__(None, None), pdf_dest, False), + ) + d__._data = self._data + try: + decoded_self = self.decoded_self + if decoded_self is None: + d__.decoded_self = None + else: + d__.decoded_self = cast( + "DecodedStreamObject", decoded_self.replicate(pdf_dest) + ) + except Exception: + pass + for k, v in self.items(): + d__[k.replicate(pdf_dest)] = ( + v.replicate(pdf_dest) if hasattr(v, "replicate") else v + ) + # Also copy the ContentStream-specific state onto the replica. + d__.set_data(self._data) + d__.pdf = pdf_dest + d__._operations = list(self._operations) + d__.forced_encoding = self.forced_encoding + return d__ def clone( self, diff --git a/tests/test_generic.py b/tests/test_generic.py index d5fad26d7..369ad7911 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1155,3 +1155,43 @@ def test_is_null_or_none(): writer = PdfWriter(reader) writer.pages[0]["/Contents"].append(writer._add_object(NullObject())) assert is_null_or_none(writer.pages[0]["/Contents"][-1]) + + +def test_coverage_arrayobject(): + writer = PdfWriter() + a = ArrayObject([1]) + assert isinstance(a.replicate(writer)[0], int) + assert isinstance(a.clone(writer)[0], int) + a.indirect_reference = IndirectObject(1, 0, writer) + assert isinstance(a.clone(writer)[0], int) + r = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + a = ArrayObject([r.pages[0]["/Contents"][0].get_object()]) + aa = a.clone(writer) + assert isinstance(aa[0], IndirectObject) 
+ for k, v in aa.items(): + assert isinstance(k, int) + assert isinstance(v, PdfObject) + + +def test_coverage_streamobject(): + writer = PdfWriter() + s = StreamObject() + del s.decoded_self + s.replicate(writer) + s.clone(writer) + + co = ContentStream(None, None) + co.replicate(writer) + co.clone(writer, False, None) + co.indirect_reference = IndirectObject(1, 0, writer) + assert co == co.clone(writer) + + r = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + co = r.pages[0].get_contents() + co[NameObject("/testkey")] = NameObject("/test") + co.decoded_self = None + assert "/testkey" in co.replicate(writer) + co = r.pages[0].get_contents() + co[NameObject("/testkey")] = NameObject("/test") + co.decoded_self = DecodedStreamObject() + assert "/testkey" in co.replicate(writer) diff --git a/tests/test_writer.py b/tests/test_writer.py index 178eb2408..d422cd69c 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1140,8 +1140,7 @@ def test_set_page_label(pdf_file_path): # Tests full length with labels assigned at first and last elements # Tests different labels assigned to consecutive ranges - writer = PdfWriter() - writer.clone_document_from_reader(reader) + writer = PdfWriter(reader, full=True) writer.set_page_label(0, 1, "/r") writer.set_page_label(4, 5, "/A") writer.set_page_label(10, 10, "/A") @@ -2428,7 +2427,7 @@ def test_increment_writer(caplog): ) assert "/ForTestOnly" in reader.get_object(5) with pytest.raises(PyPdfError): - writer = PdfWriter(reader, incremental=True) + writer = PdfWriter(1, incremental=True) b.seek(0) writer = PdfWriter(b, incremental=True) assert writer.list_objects_in_increment() == [] # no flowdown of properties