From fba73a47fc08a28b6b7d013104e2d322039e9cae Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 23 Aug 2024 23:05:37 +0200 Subject: [PATCH 01/40] ENH: add incremental capability to PdfWriter closes #2780 --- pypdf/_doc_common.py | 22 ++- pypdf/_page.py | 12 ++ pypdf/_protocols.py | 3 + pypdf/_reader.py | 2 + pypdf/_writer.py | 230 +++++++++++++++++++++++------- pypdf/constants.py | 7 +- pypdf/generic/_base.py | 90 +++++++++++- pypdf/generic/_data_structures.py | 26 ++++ tests/test_reader.py | 2 +- 9 files changed, 336 insertions(+), 58 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 4f607340d..12848fb8e 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -254,6 +254,8 @@ class PdfDocCommon: _encryption: Optional[Encryption] = None + _readonly: bool = False + @property @abstractmethod def root_object(self) -> DictionaryObject: @@ -349,7 +351,7 @@ def get_num_pages(self) -> int: return self.root_object["/Pages"]["/Count"] # type: ignore else: if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None return len(self.flattened_pages) @@ -366,7 +368,7 @@ def get_page(self, page_number: int) -> PageObject: A :class:`PageObject` instance. """ if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None, "hint for mypy" return self.flattened_pages[page_number] @@ -1082,10 +1084,19 @@ def page_mode(self) -> Optional[PagemodeType]: def _flatten( self, + list_only: bool = False, pages: Union[None, DictionaryObject, PageObject] = None, inherit: Optional[Dict[str, Any]] = None, indirect_reference: Optional[IndirectObject] = None, ) -> None: + """ + prepare the document pages to ease searching + args: + list_only: will only list the pages witin _flatten_pages + pages, + inherit, + indirect_reference: used recursively to flatten the /Pages object + """ inheritable_page_attributes = ( NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), @@ -1122,7 +1133,7 @@ def _flatten( if obj: # damaged file may have invalid child in /Pages try: - self._flatten(obj, inherit, **addt) + self._flatten(list_only, obj, inherit, **addt) except RecursionError: raise PdfReadError( "Maximum recursion depth reached during page flattening." @@ -1134,7 +1145,8 @@ def _flatten( if attr_in not in pages: pages[attr_in] = value page_obj = PageObject(self, indirect_reference) - page_obj.update(pages) + if not list_only: + page_obj.update(pages) # TODO: Could flattened_pages be None at this point? self.flattened_pages.append(page_obj) # type: ignore @@ -1158,7 +1170,7 @@ def remove_page( or destinations to reference a detached page. """ if self.flattened_pages is None: - self._flatten() + self._flatten(self._readonly) assert self.flattened_pages is not None if isinstance(page, IndirectObject): p = page.get_object() diff --git a/pypdf/_page.py b/pypdf/_page.py index c51aee1ab..8a8c47eec 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -493,6 +493,18 @@ def __init__( # below Union for mypy but actually Optional[List[str]] self.indirect_reference = indirect_reference + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + Note: this function is overloaded to return the same results + as a DictionaryObject + """ + return hash( + (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items()))) + ) + def hash_value_data(self) -> bytes: data = super().hash_value_data() data += b"%d" % id(self) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index b5fa14879..431db1a11 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: List[Any] _id_translated: Dict[int, Dict[int, int]] + incremental: bool + _reader: Any # PdfReader + @abstractmethod def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: ... # pragma: no cover diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 1ffcd436d..cd6be5083 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -136,6 +136,7 @@ def __init__( with open(stream, "rb") as fh: stream = BytesIO(fh.read()) self._stream_opened = True + self._startxref: int = 0 self.read(stream) self.stream = stream @@ -560,6 +561,7 @@ def read(self, stream: StreamType) -> None: self._basic_validation(stream) self._find_eof_marker(stream) startxref = self._find_startxref_pos(stream) + self._startxref = startxref # check and eventually correct the startxref only in not strict xref_issue_nr = self._get_xref_issues(stream, startxref) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index a72e2a23d..e47679d45 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1,3 +1,6 @@ +# TODO : thing about pages to have a global soluce without rework; +# consider question about heritage of properties + # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # @@ -154,10 +157,35 @@ def __init__( self, fileobj: Union[None, PdfReader, StrByteType, Path] = "", clone_from: Union[None, PdfReader, StrByteType, Path] = None, + incremental: bool = False, ) -> None: - self._header = b"%PDF-1.3" + self.incremental = incremental + if self.incremental: + if isinstance(fileobj, (str, Path)): + with open(fileobj, "rb") as f: + fileobj = BytesIO(f.read(-1)) + if isinstance(fileobj, IO): + fileobj = BytesIO(fileobj.read(-1)) + if isinstance(fileobj, BytesIO): + fileobj = PdfReader(fileobj) + else: + raise PyPdfError("Invalid type for incremental mode") + self._reader = fileobj # prev content is in _reader.stream + self._header = fileobj.pdf_header.encode() + self._readonly = True # !!!TODO: to be analysed + else: + self._header = b"%PDF-1.3" + """ + The indirect objects in the PDF. + for the incremental it will be filled with None + in clone_reader_document_root + """ self._objects: List[Optional[PdfObject]] = [] - """The indirect objects in the PDF.""" + + """ + list of hashes after import; used to identify changes + """ + self._original_hash: List[int] = [] """Maps hash values of indirect objects to the list of IndirectObjects. This is used for compression. @@ -168,33 +196,7 @@ def __init__( dict[id(pdf)][(idnum, generation)] """ self._id_translated: Dict[int, Dict[int, int]] = {} - - # The root of our page tree node. - pages = DictionaryObject() - pages.update( - { - NameObject(PA.TYPE): NameObject("/Pages"), - NameObject(PA.COUNT): NumberObject(0), - NameObject(PA.KIDS): ArrayObject(), - } - ) - self._pages = self._add_object(pages) - self.flattened_pages = [] - - # info object - info = DictionaryObject() - info.update({NameObject("/Producer"): create_string_object("pypdf")}) - self._info_obj: PdfObject = self._add_object(info) - - # root object - self._root_object = DictionaryObject() - self._root_object.update( - { - NameObject(PA.TYPE): NameObject(CO.CATALOG), - NameObject(CO.PAGES): self._pages, - } - ) - self._root = self._add_object(self._root_object) + self._ID: Union[ArrayObject, None] = None def _get_clone_from( fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO], @@ -227,14 +229,44 @@ def _get_clone_from( self.temp_fileobj = fileobj self.fileobj = "" self.with_as_usage = False + # The root of our page tree node. + pages = DictionaryObject() + pages.update( + { + NameObject(PA.TYPE): NameObject("/Pages"), + NameObject(PA.COUNT): NumberObject(0), + NameObject(PA.KIDS): ArrayObject(), + } + ) + self.flattened_pages = [] + self._encryption: Optional[Encryption] = None + self._encrypt_entry: Optional[DictionaryObject] = None + self._info_obj: PdfObject + if clone_from is not None: if not isinstance(clone_from, PdfReader): clone_from = PdfReader(clone_from) self.clone_document_from_reader(clone_from) - - self._encryption: Optional[Encryption] = None - self._encrypt_entry: Optional[DictionaryObject] = None - self._ID: Union[ArrayObject, None] = None + else: + self._pages = self._add_object(pages) + # root object + self._root_object = DictionaryObject() + self._root_object.update( + { + NameObject(PA.TYPE): NameObject(CO.CATALOG), + NameObject(CO.PAGES): self._pages, + } + ) + self._add_object(self._root_object) + # info object + info = DictionaryObject() + info.update({NameObject("/Producer"): create_string_object("pypdf")}) + self._info_obj = self._add_object(info) + if isinstance(self._ID, list): + if isinstance(self._ID[0], TextStringObject): + self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes()) + if isinstance(self._ID[1], TextStringObject): + self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes()) # for commonality @property @@ -1115,18 +1147,29 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: Args: reader: PdfReader from which the document root should be copied. """ - self._objects.clear() + if self.incremental: + self._objects = [None] * cast(int, reader.trailer["/Size"]) + else: + self._objects.clear() self._root_object = reader.root_object.clone(self) - self._root = self._root_object.indirect_reference # type: ignore[assignment] self._pages = self._root_object.raw_get("/Pages") + + assert len(self._objects) <= cast(int, reader.trailer["/Size"]) # for pytest + # must be done here before rewriting + if self.incremental: + self._original_hash = [ + (obj.hash_bin() if obj is not None else 0) for obj in self._objects + ] self._flatten() assert self.flattened_pages is not None for p in self.flattened_pages: - p[NameObject("/Parent")] = self._pages - self._objects[cast(IndirectObject, p.indirect_reference).idnum - 1] = p - cast(DictionaryObject, self._pages.get_object())[ - NameObject("/Kids") - ] = ArrayObject([p.indirect_reference for p in self.flattened_pages]) + self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p) + if not self.incremental: + p[NameObject("/Parent")] = self._pages + if not self.incremental: + cast(DictionaryObject, self._pages.get_object())[ + NameObject("/Kids") + ] = ArrayObject([p.indirect_reference for p in self.flattened_pages]) def clone_document_from_reader( self, @@ -1148,13 +1191,26 @@ def clone_document_from_reader( document. """ self.clone_reader_document_root(reader) - self._info_obj = self._add_object(DictionaryObject()) if TK.INFO in reader.trailer: - self._info = reader._info # actually copy fields + if self.incremental: + inf = reader._info + if inf is not None: + self._info_obj = cast( + IndirectObject, inf.clone(self).indirect_reference + ) + self._original_hash[ + cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 + ] = self._info_obj.hash_bin() + else: + self._info = reader._info # actually copy fields + + else: + self._info_obj = self._add_object(DictionaryObject()) try: self._ID = cast(ArrayObject, reader._ID).clone(self) except AttributeError: pass + if callable(after_page_append): for page in cast( ArrayObject, cast(DictionaryObject, self._pages.get_object())["/Kids"] @@ -1257,9 +1313,17 @@ def write_stream(self, stream: StreamType) -> None: # self._root = self._add_object(self._root_object) # self._sweep_indirect_references(self._root) - object_positions, free_objects = self._write_pdf_structure(stream) - xref_location = self._write_xref_table(stream, object_positions, free_objects) - self._write_trailer(stream, xref_location) + if self.incremental: + self._reader.stream.seek(0) + stream.write(self._reader.stream.read(-1)) + xref_location = self._write_increment(stream) + self._write_trailer(stream, xref_location) + else: + object_positions, free_objects = self._write_pdf_structure(stream) + xref_location = self._write_xref_table( + stream, object_positions, free_objects + ) + self._write_trailer(stream, xref_location) def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: """ @@ -1291,6 +1355,75 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: return my_file, stream + def _list_objects_in_increment(self) -> List[IndirectObject]: + """ + For debug / analysis + Provides the list of new/modified objects that are to be written + """ + ## lst = [] + ## for i in range(len(self._objects)): + ## if (self._objects[i] is not None and + ## (i >= len(self._original_hash) + ## or cast(PdfObject,self._objects[i]).hash_bin() != self._original_hash[i] + ## )): + ## lst.append(self._objects[i].indirect_reference) + return [ + cast(IndirectObject, self._objects[i]).indirect_reference + for i in range(len(self._objects)) + if ( + self._objects[i] is not None + and ( + i >= len(self._original_hash) + or cast(PdfObject, self._objects[i]).hash_bin() + != self._original_hash[i] + ) + ) + ] + + def _write_increment(self, stream: StreamType) -> int: + object_positions = {} + object_blocks = [] + current_start = -1 + current_stop = -2 + for i, obj in enumerate(self._objects): + if self._objects[i] is not None and ( + i >= len(self._original_hash) + or cast(PdfObject, self._objects[i]).hash_bin() + != self._original_hash[i] + ): + idnum = i + 1 + assert isinstance(obj, PdfObject) # mypy + # first write new/modified object + object_positions[idnum] = stream.tell() + stream.write(f"{idnum} 0 obj\n".encode()) + if self._encryption and obj != self._encrypt_entry: + obj = self._encryption.encrypt_object(obj, idnum, 0) + obj.write_to_stream(stream) + stream.write(b"\nendobj\n") + + # prepare xref + if idnum != current_stop: + if current_start > 0: + object_blocks.append( + [current_start, current_stop - current_start] + ) + current_start = idnum + current_stop = idnum + 1 + else: + current_stop = idnum + 1 + if current_start > 0: + object_blocks.append([current_start, current_stop - current_start]) + # write incremented xref + xref_location = stream.tell() + stream.write(b"xref\n") + stream.write(b"0 1\n") + stream.write(b"0000000000 65535 f \n") + for block in object_blocks: + stream.write(f"{block[0]} {block[1]}\n".encode()) + for i in range(block[0], block[0] + block[1]): + stream.write(f"{object_positions[i]:0>10} {0:0>5} n \n".encode()) + return xref_location + def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: object_positions = [] free_objects = [] # will contain list of all free entries @@ -1337,14 +1470,15 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: of certain special objects within the body of the file. """ stream.write(b"trailer\n") - trailer = DictionaryObject() - trailer.update( + trailer = DictionaryObject( { NameObject(TK.SIZE): NumberObject(len(self._objects) + 1), - NameObject(TK.ROOT): self._root, + NameObject(TK.ROOT): self.root_object.indirect_reference, NameObject(TK.INFO): self._info_obj, } ) + if self.incremental: + trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) if self._ID: trailer[NameObject(TK.ID)] = self._ID if self._encrypt_entry: diff --git a/pypdf/constants.py b/pypdf/constants.py index 745774e2a..a7e67aacc 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -33,6 +33,7 @@ class TrailerKeys: ID = "/ID" INFO = "/Info" SIZE = "/Size" + PREV = "/Prev" class CatalogAttributes: @@ -209,7 +210,7 @@ class PagesAttributes: PARENT = "/Parent" # dictionary, required; indirect reference to pages object KIDS = "/Kids" # array, required; List of indirect references COUNT = "/Count" # integer, required; the number of leaf nodes (page objects) - # that are descendants of this node within the page tree + # that are descendants of this node within the page tree class PageAttributes: @@ -217,7 +218,9 @@ class PageAttributes: TYPE = "/Type" # name, required; must be /Page PARENT = "/Parent" # dictionary, required; a pages object - LAST_MODIFIED = "/LastModified" # date, optional; date and time of last modification + LAST_MODIFIED = ( + "/LastModified" # date, optional; date and time of last modification + ) RESOURCES = "/Resources" # dictionary, required if there are any MEDIABOX = "/MediaBox" # rectangle, required; rectangle specifying page size CROPBOX = "/CropBox" # rectangle, optional diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index f48dc66c3..9dfb25a29 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -53,6 +53,16 @@ class PdfObject(PdfObjectProtocol): hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1 indirect_reference: Optional["IndirectObject"] + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not implement .hash_bin() so far" + ) + def hash_value_data(self) -> bytes: return ("%s" % self).encode() @@ -121,7 +131,15 @@ def _reference_clone( ind = self.indirect_reference except AttributeError: return clone - i = len(pdf_dest._objects) + 1 + if ( + pdf_dest.incremental + and ind is not None + and ind.pdf == pdf_dest._reader + and ind.idnum <= len(pdf_dest._objects) + ): + i = ind.idnum + else: + i = len(pdf_dest._objects) + 1 if ind is not None: if id(ind.pdf) not in pdf_dest._id_translated: pdf_dest._id_translated[id(ind.pdf)] = {} @@ -136,7 +154,11 @@ def _reference_clone( assert obj is not None return obj pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i - pdf_dest._objects.append(clone) + try: + pdf_dest._objects[i - 1] = clone + except IndexError: + pdf_dest._objects.append(clone) + i = len(pdf_dest._objects) clone.indirect_reference = IndirectObject(i, 0, pdf_dest) return clone @@ -162,6 +184,14 @@ def clone( "NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate) ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__,)) + def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: @@ -198,6 +228,14 @@ def clone( self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self.value)) + def __eq__(self, __o: object) -> bool: if isinstance(__o, BooleanObject): return self.value == __o.value @@ -242,6 +280,14 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader def __hash__(self) -> int: return hash((self.idnum, self.generation, id(self.pdf))) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self.idnum, self.generation, id(self.pdf))) + def clone( self, pdf_dest: PdfWriterProtocol, @@ -400,6 +446,14 @@ def clone( self._reference_clone(FloatObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self.as_numeric)) + def myrepr(self) -> str: if self == 0: return "0.0" @@ -445,6 +499,14 @@ def clone( self._reference_clone(NumberObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self.as_numeric())) + def as_numeric(self) -> int: return int(repr(self).encode("utf8")) @@ -488,6 +550,14 @@ def clone( ), ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, bytes(self))) + @property def original_bytes(self) -> bytes: """For compatibility with TextStringObject.original_bytes.""" @@ -567,6 +637,14 @@ def clone( "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate) ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self.original_bytes)) + @property def original_bytes(self) -> bytes: """ @@ -663,6 +741,14 @@ def clone( self._reference_clone(NameObject(self), pdf_dest, force_duplicate), ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, self)) + def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 399836be5..e53129a48 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -131,6 +131,14 @@ def clone( arr.append(data) return arr + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((self.__class__, tuple(x.hash_bin() for x in self))) + def items(self) -> Iterable[Any]: """Emulate DictionaryObject.items for a list (index, object).""" return enumerate(self) @@ -371,6 +379,16 @@ def _clone( else v ) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash( + (self.__class__, tuple(((k, v.hash_bin()) for k, v in self.items()))) + ) + def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) @@ -876,6 +894,14 @@ def _clone( pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) + def hash_bin(self) -> int: + """ + Returns: + hash considering type and value + used to detect modified object + """ + return hash((super().hash_bin(), self.get_data())) + def get_data(self) -> bytes: return self._data diff --git a/tests/test_reader.py b/tests/test_reader.py index 0413a9135..c1bdff944 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -212,7 +212,7 @@ def test_get_outline(src, outline_elements): pytest.param( "imagemagick-ASCII85Decode.pdf", ["Im0.png"], - marks=pytest.mark.xfail(reason="broken image extraction"), + # marks=pytest.mark.xfail(reason="broken image extraction"), ), ("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]), (SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf", ["X0.png"]), From 0543709a702921f767ec04aaa9ea40db1b7272bc Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 24 Aug 2024 11:21:22 +0200 Subject: [PATCH 02/40] fix test --- pypdf/_writer.py | 52 ++++++++++-------- ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217093 -> 217093 bytes 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e47679d45..dd96251de 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -160,21 +160,6 @@ def __init__( incremental: bool = False, ) -> None: self.incremental = incremental - if self.incremental: - if isinstance(fileobj, (str, Path)): - with open(fileobj, "rb") as f: - fileobj = BytesIO(f.read(-1)) - if isinstance(fileobj, IO): - fileobj = BytesIO(fileobj.read(-1)) - if isinstance(fileobj, BytesIO): - fileobj = PdfReader(fileobj) - else: - raise PyPdfError("Invalid type for incremental mode") - self._reader = fileobj # prev content is in _reader.stream - self._header = fileobj.pdf_header.encode() - self._readonly = True # !!!TODO: to be analysed - else: - self._header = b"%PDF-1.3" """ The indirect objects in the PDF. for the incremental it will be filled with None @@ -197,6 +182,28 @@ def __init__( """ self._id_translated: Dict[int, Dict[int, int]] = {} self._ID: Union[ArrayObject, None] = None + self._info_obj: PdfObject + + if self.incremental: + if isinstance(fileobj, (str, Path)): + with open(fileobj, "rb") as f: + fileobj = BytesIO(f.read(-1)) + if isinstance(fileobj, IO): + fileobj = BytesIO(fileobj.read(-1)) + if isinstance(fileobj, BytesIO): + fileobj = PdfReader(fileobj) + else: + raise PyPdfError("Invalid type for incremental mode") + self._reader = fileobj # prev content is in _reader.stream + self._header = fileobj.pdf_header.encode() + self._readonly = True # !!!TODO: to be analysed + else: + self._header = b"%PDF-1.3" + self._info_obj = self._add_object( + DictionaryObject( + {NameObject("/Producer"): create_string_object("pypdf")} + ) + ) def _get_clone_from( fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO], @@ -241,7 +248,6 @@ def _get_clone_from( self.flattened_pages = [] self._encryption: Optional[Encryption] = None self._encrypt_entry: Optional[DictionaryObject] = None - self._info_obj: PdfObject if clone_from is not None: if not isinstance(clone_from, PdfReader): @@ -258,10 +264,6 @@ def _get_clone_from( } ) self._add_object(self._root_object) - # info object - info = DictionaryObject() - info.update({NameObject("/Producer"): create_string_object("pypdf")}) - self._info_obj = self._add_object(info) if isinstance(self._ID, list): if isinstance(self._ID[0], TextStringObject): self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes()) @@ -1192,8 +1194,8 @@ def clone_document_from_reader( """ self.clone_reader_document_root(reader) if TK.INFO in reader.trailer: + inf = reader._info if self.incremental: - inf = reader._info if inf is not None: self._info_obj = cast( IndirectObject, inf.clone(self).indirect_reference @@ -1201,11 +1203,13 @@ def clone_document_from_reader( self._original_hash[ cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 ] = self._info_obj.hash_bin() - else: - self._info = reader._info # actually copy fields - + elif inf is not None: + self._info_obj = self._add_object( + DictionaryObject(cast(DictionaryObject, inf.get_object())) + ) else: self._info_obj = self._add_object(DictionaryObject()) + try: self._ID = cast(ArrayObject, reader._ID).clone(self) except AttributeError: diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index a53f28f0be432c38a1fff33672a2170eeb5f553f..8a04001ddae371fa756d1dc2f607fd42965f0f8f 100644 GIT binary patch delta 94 zcmZo&z}vcjcY^f925Cm4i2^ATXDb5fy%!jbHuo{k?gCO;J%WZn%4YBN)r>~lMKT!= s&IQuf*qG*o1L@*8CW#mz-Ls2HXaci=smb<*Ud%46jE2*n`7_G{06VfE8vp Date: Sun, 25 Aug 2024 17:10:22 +0200 Subject: [PATCH 03/40] fixes + first test --- pypdf/_page.py | 2 ++ pypdf/_writer.py | 20 ++++++++--------- pypdf/generic/_data_structures.py | 3 ++- tests/test_writer.py | 36 +++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 8a8c47eec..79cdb7adf 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -492,6 +492,8 @@ def __init__( self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] self.indirect_reference = indirect_reference + if indirect_reference is not None: + self.update(cast(DictionaryObject, indirect_reference.get_object())) def hash_bin(self) -> int: """ diff --git a/pypdf/_writer.py b/pypdf/_writer.py index dd96251de..24da87337 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1202,7 +1202,7 @@ def clone_document_from_reader( ) self._original_hash[ cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 - ] = self._info_obj.hash_bin() + ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin() elif inf is not None: self._info_obj = self._add_object( DictionaryObject(cast(DictionaryObject, inf.get_object())) @@ -1359,18 +1359,16 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: return my_file, stream - def _list_objects_in_increment(self) -> List[IndirectObject]: + def list_objects_in_increment(self) -> List[IndirectObject]: """ For debug / analysis - Provides the list of new/modified objects that are to be written - """ - ## lst = [] - ## for i in range(len(self._objects)): - ## if (self._objects[i] is not None and - ## (i >= len(self._original_hash) - ## or cast(PdfObject,self._objects[i]).hash_bin() != self._original_hash[i] - ## )): - ## lst.append(self._objects[i].indirect_reference) + Provides the list of new/modified objects that will be written + in the increment + Deleted Objects will not be freeed but will become orphans + + Returns: + List of (new / modified) IndirectObjects + """ return [ cast(IndirectObject, self._objects[i]).indirect_reference for i in range(len(self._objects)) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index e53129a48..00f4ceab8 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -900,7 +900,8 @@ def hash_bin(self) -> int: hash considering type and value used to detect modified object """ - return hash((super().hash_bin(), self.get_data())) + # use of _data to prevent errors on non decoded stream such as JBIG2 + return hash((super().hash_bin(), self._data)) def get_data(self) -> bytes: return self._data diff --git a/tests/test_writer.py b/tests/test_writer.py index b6a47a18c..3ac1f06da 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2354,3 +2354,39 @@ def test_utf16_metadata(): b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e" b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)" ) + + +def test_list_objects_in_increment(caplog): + """Tests for #2811""" + writer = PdfWriter( + RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", + incremental=True, + ) + # Contains JBIG2 not decoded for the moment + assert writer.list_objects_in_increment() == [] # no flowdown of properties + # modify one object + writer.pages[0][NameObject("/MediaBox")] = ArrayObject( + [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] + ) + assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)] + b = BytesIO() + writer.write(b) + assert b.getvalue().startswith(writer._reader.stream.getvalue()) + b.seek(0) + reader = PdfReader(b) + assert reader.pages[0]["/MediaBox"] == ArrayObject( + [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] + ) + with pytest.raises(PyPdfError): + writer = PdfWriter(reader, incremental=True) + b.seek(0) + writer = PdfWriter(b, incremental=True) + assert writer.list_objects_in_increment() == [] # no flowdown of properties + + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) + # 1 object is modified: page 0 inherits MediaBox so is changed + assert len(writer.list_objects_in_increment()) == 1 + + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) + # 1 object is modified: page 0 inherits MediaBox so is changed + assert len(writer.list_objects_in_increment()) == len(writer._objects) From 1067b744eeac6374344a8c63ddce742d87d49d91 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:06:16 +0200 Subject: [PATCH 04/40] coverage --- pypdf/_page.py | 2 +- pypdf/_writer.py | 2 -- tests/test_generic.py | 6 ++++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 79cdb7adf..c81eeb8cd 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -501,7 +501,7 @@ def hash_bin(self) -> int: hash considering type and value used to detect modified object Note: this function is overloaded to return the same results - as a DictionaryObject + as a DictionaryObject """ return hash( (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items()))) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 24da87337..e052b94ae 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -188,8 +188,6 @@ def __init__( if isinstance(fileobj, (str, Path)): with open(fileobj, "rb") as f: fileobj = BytesIO(f.read(-1)) - if isinstance(fileobj, IO): - fileobj = BytesIO(fileobj.read(-1)) if isinstance(fileobj, BytesIO): fileobj = PdfReader(fileobj) else: diff --git a/tests/test_generic.py b/tests/test_generic.py index 6b8ae0151..bc83ea4fe 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1472,3 +1472,9 @@ def test_unitary_extract_inline(): ec.set_data(b) co = ContentStream(ec, None) assert co.operations[7][0]["data"] == b"abcdefghijklmnop" + + +def test_missing_hashbin(): + assert NullObject().hash_bin() == hash((NullObject,)) + t = ByteStringObject(b"123") + assert t.hash_bin() == hash((ByteStringObject, b"123")) From f1d3fbe6367e0fcc1e2efc79c1932643851dd455 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:39:37 +0200 Subject: [PATCH 05/40] coverage --- pypdf/_page.py | 1 + pypdf/_writer.py | 2 ++ tests/test_writer.py | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index c81eeb8cd..aebe9ebbd 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -500,6 +500,7 @@ def hash_bin(self) -> int: Returns: hash considering type and value used to detect modified object + Note: this function is overloaded to return the same results as a DictionaryObject """ diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e052b94ae..a0d55e3c5 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1396,8 +1396,10 @@ def _write_increment(self, stream: StreamType) -> int: # first write new/modified object object_positions[idnum] = stream.tell() stream.write(f"{idnum} 0 obj\n".encode()) + """ encryption is not operational if self._encryption and obj != self._encrypt_entry: obj = self._encryption.encrypt_object(obj, idnum, 0) + """ obj.write_to_stream(stream) stream.write(b"\nendobj\n") diff --git a/tests/test_writer.py b/tests/test_writer.py index 3ac1f06da..1a172e8c3 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2369,6 +2369,10 @@ def test_list_objects_in_increment(caplog): [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)] + writer.pages[5][NameObject("/MediaBox")] = ArrayObject( + [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] + ) + assert len(writer.list_objects_in_increment()) == 2 b = BytesIO() writer.write(b) assert b.getvalue().startswith(writer._reader.stream.getvalue()) From ae97bc73b4f6b0b2653009b47b5b6ead47e13424 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:06:03 +0200 Subject: [PATCH 06/40] cope with multiple level pages --- pypdf/_doc_common.py | 41 ++++++++++++++++++++++++++++++++++++++--- pypdf/_page.py | 24 +++++++++++++++--------- pypdf/_writer.py | 43 ++++++++++++++++++++++++++++++++----------- tests/test_page.py | 4 +++- 4 files changed, 88 insertions(+), 24 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 12848fb8e..ea3c93aab 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -65,9 +65,7 @@ from .constants import FieldDictionaryAttributes as FA from .constants import PageAttributes as PG from .constants import PagesAttributes as PA -from .errors import ( - PdfReadError, -) +from .errors import PdfReadError, PyPdfError from .generic import ( ArrayObject, BooleanObject, @@ -372,6 +370,43 @@ def get_page(self, page_number: int) -> PageObject: assert self.flattened_pages is not None, "hint for mypy" return self.flattened_pages[page_number] + def _get_page_in_node( + self, + page_number: int, + ) -> Tuple[DictionaryObject, int]: + """ + Retrieve the node and position within the /Kids containing the page + if page_number is greater than the number of page, it returns top node, -1 + """ + top = cast(DictionaryObject, self.root_object["/Pages"]) + + def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]: + ma = cast(int, node.get("/Count", 1)) # default 1 for /Page types + if node["/Type"] == "/Page": + if page_number == mi: + return node, -1 + # else: + return None, mi + 1 + if (page_number - mi) >= ma: # not in nodes below + if node == top: + return top, -1 + # else + return None, mi + ma + for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])): + kid = cast(DictionaryObject, kid.get_object()) + n, i = recurs(kid, mi) + if n is not None: # page has just been found ... + if i < 0: # ... just below! + return node, idx + # else: # ... at lower levels + return n, i + mi = i + raise PyPdfError("abnormal, can not find the node") + + node, idx = recurs(top, 0) + assert isinstance(node, DictionaryObject) + return node, idx + @property def named_destinations(self) -> Dict[str, Any]: """ diff --git a/pypdf/_page.py b/pypdf/_page.py index aebe9ebbd..b9f6e012b 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2414,27 +2414,33 @@ def __delitem__(self, index: Union[int, slice]) -> None: raise IndexError("index out of range") ind = self[index].indirect_reference assert ind is not None - parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None) + parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get( + "/Parent", None + ) + first = True while parent is not None: parent = cast(DictionaryObject, parent.get_object()) try: - i = parent["/Kids"].index(ind) - del parent["/Kids"][i] + i = cast(ArrayObject, parent["/Kids"]).index(ind) + del cast(ArrayObject, parent["/Kids"])[i] + first = False try: assert ind is not None del ind.pdf.flattened_pages[index] # case of page in a Reader except Exception: # pragma: no cover pass if "/Count" in parent: - parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1) - if len(parent["/Kids"]) == 0: + parent[NameObject("/Count")] = NumberObject( + cast(int, parent["/Count"]) - 1 + ) + if len(cast(ArrayObject, parent["/Kids"])) == 0: # No more objects in this part of this sub tree ind = parent.indirect_reference - parent = cast(DictionaryObject, parent.get("/Parent", None)) - else: - parent = None + parent = parent.get("/Parent", None) except ValueError: # from index - raise PdfReadError(f"Page Not Found in Page Tree {ind}") + if first: + raise PdfReadError(f"Page Not Found in Page Tree {ind}") + break def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): diff --git a/pypdf/_writer.py b/pypdf/_writer.py index a0d55e3c5..e2747c153 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -439,10 +439,12 @@ def _replace_object( def _add_page( self, page: PageObject, - action: Callable[[Any, Union[PageObject, IndirectObject]], None], + index: int, excluded_keys: Iterable[str] = (), ) -> PageObject: - assert cast(str, page[PA.TYPE]) == CO.PAGE + if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE: + raise ValueError("Invalid page Object") + assert self.flattened_pages is not None, "for mypy" page_org = page excluded_keys = list(excluded_keys) excluded_keys += [PA.PARENT, "/StructParents"] @@ -460,13 +462,23 @@ def _add_page( if page_org.pdf is not None: other = page_org.pdf.pdf_header self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) - page[NameObject(PA.PARENT)] = self._pages - pages = cast(DictionaryObject, self.get_object(self._pages)) - assert page.indirect_reference is not None - action(pages[PA.KIDS], page.indirect_reference) - action(self.flattened_pages, page) - page_count = cast(int, pages[PA.COUNT]) - pages[NameObject(PA.COUNT)] = NumberObject(page_count + 1) + node, idx = self._get_page_in_node(index) + page[NameObject(PA.PARENT)] = node.indirect_reference + if idx >= 0: # to be a + cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference) + if self.flattened_pages != node[PA.KIDS]: + self.flattened_pages.insert(index, page) + else: + cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference) + if self.flattened_pages != node[PA.KIDS]: + self.flattened_pages.append(page) + cpt = 1000 + while node is not None: + node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1) + node = node.get(PA.PARENT, None) + cpt -= 1 + if cpt < 0: + raise PyPdfError("Recursive Error detected") return page def set_need_appearances_writer(self, state: bool = True) -> None: @@ -529,7 +541,8 @@ def add_page( Returns: The added PageObject. """ - return self._add_page(page, list.append, excluded_keys) + assert self.flattened_pages is not None + return self._add_page(page, len(self.flattened_pages), excluded_keys) def insert_page( self, @@ -549,7 +562,15 @@ def insert_page( Returns: The added PageObject. """ - return self._add_page(page, lambda kids, p: kids.insert(index, p)) + assert self.flattened_pages is not None + if index < 0: + index = len(self.flattened_pages) + index + if index < 0: + raise ValueError("invalid index value") + if index >= len(self.flattened_pages): + return self.add_page(page, excluded_keys) + else: + return self._add_page(page, index, excluded_keys) def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] diff --git a/tests/test_page.py b/tests/test_page.py index 72df648e4..8bde3e82e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1251,7 +1251,9 @@ def test_del_pages(): del pp["/Parent"].get_object()["/Kids"][i] with pytest.raises(PdfReadError): del reader.pages[2] - # reader is corrupted we have to reload it + + url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf" + name = "iss2343.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) del reader.pages[:] assert len(reader.pages) == 0 From d9a99d9e4415a188b45dbf37e79925e9cac9193a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:24:52 +0200 Subject: [PATCH 07/40] test + doc --- pypdf/_writer.py | 27 +++++++++++++++++++++------ tests/test_page.py | 5 +++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e2747c153..59d6b3822 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -151,6 +151,15 @@ class PdfWriter(PdfDocCommon): cloning a PDF file during initialization. Typically data is added from a :class:`PdfReader`. + + clone_from: identical to fileobj (for compatibility) + + incremental: `bool` + If true, loads the document and set the PdfWriter in incremental mode + + When writing the original document is written first and new/modified + are appened. to be used for signed document/forms to keep signature + valid. """ def __init__( @@ -161,26 +170,32 @@ def __init__( ) -> None: self.incremental = incremental """ + Returns if the PdfWriter object has been started in incremental mode + """ + + self._objects: List[Optional[PdfObject]] = [] + """ The indirect objects in the PDF. for the incremental it will be filled with None in clone_reader_document_root """ - self._objects: List[Optional[PdfObject]] = [] + self._original_hash: List[int] = [] """ list of hashes after import; used to identify changes """ - self._original_hash: List[int] = [] - """Maps hash values of indirect objects to the list of IndirectObjects. - This is used for compression. - """ self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} + """ + Maps hash values of indirect objects to the list of IndirectObjects. + This is used for compression. + """ + self._id_translated: Dict[int, Dict[int, int]] = {} """List of already translated IDs. dict[id(pdf)][(idnum, generation)] """ - self._id_translated: Dict[int, Dict[int, int]] = {} + self._ID: Union[ArrayObject, None] = None self._info_obj: PdfObject diff --git a/tests/test_page.py b/tests/test_page.py index 8bde3e82e..ac9d241a7 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1252,9 +1252,10 @@ def test_del_pages(): with pytest.raises(PdfReadError): del reader.pages[2] - url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf" - name = "iss2343.pdf" + url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" + name = "iss2343b.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + del reader.pages[4] # to propagate among /Pages del reader.pages[:] assert len(reader.pages) == 0 assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0 From 3c4cfdc2510587c8a75cbe6d6760362db44a2fa1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:30:20 +0200 Subject: [PATCH 08/40] coverage --- tests/test_page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_page.py b/tests/test_page.py index ac9d241a7..d9efd4992 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1254,11 +1254,11 @@ def test_del_pages(): url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" name = "iss2343b.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) del reader.pages[4] # to propagate among /Pages del reader.pages[:] assert len(reader.pages) == 0 - assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0 + assert len(reader.root_object["/Pages"]["/Kids"]) == 0 assert len(reader.flattened_pages) == 0 From 38d4b351d81719ed774476f0cf7ee7187ff55a9e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:57:20 +0200 Subject: [PATCH 09/40] coverage --- pypdf/_writer.py | 11 +++-------- tests/test_page.py | 23 ++++++++++++++++------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 59d6b3822..b981cb0d5 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1,6 +1,3 @@ -# TODO : thing about pages to have a global soluce without rework; -# consider question about heritage of properties - # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # @@ -154,12 +151,10 @@ class PdfWriter(PdfDocCommon): clone_from: identical to fileobj (for compatibility) - incremental: `bool` - If true, loads the document and set the PdfWriter in incremental mode + incremental: If true, loads the document and set the PdfWriter in incremental mode - When writing the original document is written first and new/modified - are appened. to be used for signed document/forms to keep signature - valid. + When writing in incremental the original document is written first and new/modified + are appened. to be used for signed document/forms to keep signature valid. """ def __init__( diff --git a/tests/test_page.py b/tests/test_page.py index d9efd4992..dc3ec9c55 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -12,7 +12,7 @@ from pypdf import PdfReader, PdfWriter, Transformation from pypdf._page import PageObject from pypdf.constants import PageAttributes as PG -from pypdf.errors import PdfReadError, PdfReadWarning +from pypdf.errors import PdfReadError, PdfReadWarning, PyPdfError from pypdf.generic import ( ArrayObject, ContentStream, @@ -887,6 +887,8 @@ def test_annotation_setter(pdf_file_path): page = reader.pages[0] writer = PdfWriter() writer.add_page(page) + with pytest.raises(ValueError): + writer.add_page(DictionaryObject()) # Act page_number = 0 @@ -1254,12 +1256,19 @@ def test_del_pages(): url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" name = "iss2343b.pdf" - reader = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) - del reader.pages[4] # to propagate among /Pages - del reader.pages[:] - assert len(reader.pages) == 0 - assert len(reader.root_object["/Pages"]["/Kids"]) == 0 - assert len(reader.flattened_pages) == 0 + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) + node, idx = writer._get_page_in_node(53) + assert (node.indirect_reference.idnum, idx) == (11776, 1) + node, idx = writer._get_page_in_node(10000) + assert (node.indirect_reference.idnum, idx) == (11769, -1) + with pytest.raises(PyPdfError): + writer._get_page_in_node(-1) + + del writer.pages[4] # to propagate among /Pages + del writer.pages[:] + assert len(writer.pages) == 0 + assert len(writer.root_object["/Pages"]["/Kids"]) == 0 + assert len(writer.flattened_pages) == 0 def test_pdf_pages_missing_type(): From 79eca73b7774dadedac01c188681b4559e6cfcaf Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:38:02 +0200 Subject: [PATCH 10/40] coverage --- pypdf/_writer.py | 5 ++--- tests/test_page.py | 13 +++++++++++++ tests/test_writer.py | 4 +++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index b981cb0d5..b532b6446 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -484,6 +484,7 @@ def _add_page( self.flattened_pages.append(page) cpt = 1000 while node is not None: + node = cast(DictionaryObject, node.get_object()) node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1) node = node.get(PA.PARENT, None) cpt -= 1 @@ -1441,9 +1442,7 @@ def _write_increment(self, stream: StreamType) -> int: [current_start, current_stop - current_start] ) current_start = idnum - current_stop = idnum + 1 - else: - current_stop = idnum + 1 + current_stop = idnum + 1 if current_start > 0: object_blocks.append([current_start, current_stop - current_start]) # write incremented xref diff --git a/tests/test_page.py b/tests/test_page.py index dc3ec9c55..39b1f4ec5 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1459,3 +1459,16 @@ def test_get_contents_as_bytes(): assert writer.pages[0]._get_contents_as_bytes() == expected writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0] assert writer.pages[0]._get_contents_as_bytes() == expected + + +def test_recursive_get_page_from_node(): + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) + writer.root_object["/Pages"].get_object()[ + NameObject("/Parent") + ] = writer.root_object["/Pages"].indirect_reference + with pytest.raises(PyPdfError): + writer.add_page(writer.pages[0]) + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) + writer.insert_page(writer.pages[0], -1) + with pytest.raises(ValueError): + writer.insert_page(writer.pages[0], -10) diff --git a/tests/test_writer.py b/tests/test_writer.py index 1a172e8c3..160ef4023 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2356,7 +2356,7 @@ def test_utf16_metadata(): ) -def test_list_objects_in_increment(caplog): +def test_increment_writer(caplog): """Tests for #2811""" writer = PdfWriter( RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", @@ -2369,6 +2369,8 @@ def test_list_objects_in_increment(caplog): [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)] + b = BytesIO() + writer.write(b) writer.pages[5][NameObject("/MediaBox")] = ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) From 290c5a6f423ab1af59431bdc76243c0b3a4a63c1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:52:07 +0200 Subject: [PATCH 11/40] coverage --- tests/test_writer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 160ef4023..64d06d9b6 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1795,6 +1795,9 @@ def test_missing_info(): writer = PdfWriter(clone_from=reader) assert len(writer.pages) == len(reader.pages) + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + writer._info = reader._info + assert dict(writer._info) == dict(reader._info) @pytest.mark.enable_socket() From 173578d43011132197a44d5e16d225b5e7a9a3df Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:24:39 +0200 Subject: [PATCH 12/40] coverage --- pypdf/_writer.py | 4 +++- tests/test_writer.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index b532b6446..4850f251b 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -468,7 +468,9 @@ def _add_page( ] except Exception: pass - page = cast("PageObject", page_org.clone(self, False, excluded_keys)) + page = cast( + "PageObject", page_org.clone(self, False, excluded_keys).get_object() + ) if page_org.pdf is not None: other = page_org.pdf.pdf_header self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) diff --git a/tests/test_writer.py b/tests/test_writer.py index 64d06d9b6..25fb30623 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2399,3 +2399,22 @@ def test_increment_writer(caplog): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) # 1 object is modified: page 0 inherits MediaBox so is changed assert len(writer.list_objects_in_increment()) == len(writer._objects) + + # insert pages in a tree + url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf" + name = "iss2343b.pdf" + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True) + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + pg = writer.insert_page(reader.pages[0], 4) + assert ( + pg.raw_get("/Parent") + == writer.root_object["/Pages"]["/Kids"][0].get_object()["/Kids"][0] + ) + assert pg["/Parent"]["/Count"] == 8 + assert writer.root_object["/Pages"]["/Count"] == 285 + assert len(writer.flattened_pages) == 285 + + # clone without info + writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) + assert len(writer.list_objects_in_increment()) == 1 + assert writer._info == {} From 1a6eda51cb215eefd18619d988facf8a84c5f2ae Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:16:49 +0200 Subject: [PATCH 13/40] simplification --- pypdf/_writer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 4850f251b..d400cf5f0 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -476,14 +476,13 @@ def _add_page( self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) node, idx = self._get_page_in_node(index) page[NameObject(PA.PARENT)] = node.indirect_reference + if idx >= 0: # to be a cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference) - if self.flattened_pages != node[PA.KIDS]: - self.flattened_pages.insert(index, page) + self.flattened_pages.insert(index, page) else: cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference) - if self.flattened_pages != node[PA.KIDS]: - self.flattened_pages.append(page) + self.flattened_pages.append(page) cpt = 1000 while node is not None: node = cast(DictionaryObject, node.get_object()) From d43d25b6f6c4fdd09424ccb369e14177175921c8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:39:21 +0200 Subject: [PATCH 14/40] coverage --- tests/test_writer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 25fb30623..794dd0469 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2367,6 +2367,16 @@ def test_increment_writer(caplog): ) # Contains JBIG2 not decoded for the moment assert writer.list_objects_in_increment() == [] # no flowdown of properties + + # test writing with empty increment + b = BytesIO() + writer.write(b) + b.seek(0) + writer2 = PdfWriter(b, incremental=True) + assert len([x for x in writer2._objects if x is not None]) == len( + [x for x in writer._objects if x is not None] + ) + # modify one object writer.pages[0][NameObject("/MediaBox")] = ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] @@ -2378,6 +2388,9 @@ def test_increment_writer(caplog): [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) assert len(writer.list_objects_in_increment()) == 2 + # modify object IndirectObject(5,0) : for coverage + writer.get_object(5)[NameObject("/ForTestOnly")] = NameObject("/ForTestOnly") + b = BytesIO() writer.write(b) assert b.getvalue().startswith(writer._reader.stream.getvalue()) @@ -2386,6 +2399,7 @@ def test_increment_writer(caplog): assert reader.pages[0]["/MediaBox"] == ArrayObject( [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)] ) + assert "/ForTestOnly" in reader.get_object(5) with pytest.raises(PyPdfError): writer = PdfWriter(reader, incremental=True) b.seek(0) From c9a6c95e06fcd090155f98cba4f90164fb30da9c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:59:32 +0200 Subject: [PATCH 15/40] ENH: add capability to remove /Info from pypdf to be merged after #2811 --- docs/user/metadata.md | 24 ++++++++++++ pypdf/_writer.py | 88 ++++++++++++++++++++++++++++++++----------- tests/test_writer.py | 33 +++++++++++++++- 3 files changed, 120 insertions(+), 25 deletions(-) diff --git a/docs/user/metadata.md b/docs/user/metadata.md index 7f0a57694..a2bbdf9f0 100644 --- a/docs/user/metadata.md +++ b/docs/user/metadata.md @@ -76,6 +76,30 @@ writer.add_metadata( } ) +# Clear all data but keep the entry in PDF +writer.metadata = {} + +# Replace all entries with new set of entries +writer.metadata = { + "/Author": "Martin", + "/Producer": "Libre Writer", +} + +# Save the new PDF to a file +with open("meta-pdf.pdf", "wb") as f: + writer.write(f) +``` + +## Removing metadata entry + +```python +from pypdf import PdfWriter + +writer = PdfWriter("example.pdf") + +# Remove Metadata (/Info entry) +writer.metadata = None + # Save the new PDF to a file with open("meta-pdf.pdf", "wb") as f: writer.write(f) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index d400cf5f0..ca3ab9030 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -52,7 +52,7 @@ ) from ._cmap import _default_fonts_space_width, build_char_map_from_dict -from ._doc_common import PdfDocCommon +from ._doc_common import DocumentInformation, PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject from ._page_labels import nums_clear_range, nums_insert, nums_next @@ -192,7 +192,7 @@ def __init__( """ self._ID: Union[ArrayObject, None] = None - self._info_obj: PdfObject + self._info_obj: Optional[PdfObject] if self.incremental: if isinstance(fileobj, (str, Path)): @@ -307,13 +307,26 @@ def _info(self) -> Optional[DictionaryObject]: Returns: /Info Dictionary; None if the entry does not exist """ - return cast(DictionaryObject, self._info_obj.get_object()) + return ( + None + if self._info_obj is None + else cast(DictionaryObject, self._info_obj.get_object()) + ) @_info.setter - def _info(self, value: Union[IndirectObject, DictionaryObject]) -> None: - obj = cast(DictionaryObject, self._info_obj.get_object()) - obj.clear() - obj.update(cast(DictionaryObject, value.get_object())) + def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None: + if value is None: + try: + self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore + except (KeyError, AttributeError): + pass + self._info_obj = None + else: + if self._info_obj is None: + self._info_obj = self._add_object(DictionaryObject()) + obj = cast(DictionaryObject, self._info_obj.get_object()) + obj.clear() + obj.update(cast(DictionaryObject, value.get_object())) @property def xmp_metadata(self) -> Optional[XmpInformation]: @@ -1184,6 +1197,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: self._objects = [None] * cast(int, reader.trailer["/Size"]) else: self._objects.clear() + self._info_obj = None self._root_object = reader.root_object.clone(self) self._pages = self._root_object.raw_get("/Pages") @@ -1224,22 +1238,21 @@ def clone_document_from_reader( document. """ self.clone_reader_document_root(reader) - if TK.INFO in reader.trailer: - inf = reader._info - if self.incremental: - if inf is not None: - self._info_obj = cast( - IndirectObject, inf.clone(self).indirect_reference - ) - self._original_hash[ - cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1 - ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin() - elif inf is not None: - self._info_obj = self._add_object( - DictionaryObject(cast(DictionaryObject, inf.get_object())) + inf = reader._info + if self.incremental: + if inf is not None: + self._info_obj = cast( + IndirectObject, inf.clone(self).indirect_reference ) - else: - self._info_obj = self._add_object(DictionaryObject()) + assert isinstance(self._info, DictionaryObject), "for mypy" + self._original_hash[ + self._info_obj.indirect_reference.idnum - 1 + ] = self._info.hash_bin() + elif inf is not None: + self._info_obj = self._add_object( + DictionaryObject(cast(DictionaryObject, inf.get_object())) + ) + # else: _info_obj = None done in clone_reader_document_root() try: self._ID = cast(ArrayObject, reader._ID).clone(self) @@ -1507,9 +1520,10 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: { NameObject(TK.SIZE): NumberObject(len(self._objects) + 1), NameObject(TK.ROOT): self.root_object.indirect_reference, - NameObject(TK.INFO): self._info_obj, } ) + if self._info is not None: + trailer[NameObject(TK.INFO)] = self._info.indirect_reference if self.incremental: trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) if self._ID: @@ -1519,6 +1533,34 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: trailer.write_to_stream(stream) stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof + @property + def metadata(self) -> Optional[DocumentInformation]: + """ + Retrieve/set the PDF file's document information dictionary, if it exists. + + Args: + value: dict with the entries to be set. if None : remove the /Info entry from the pdf. + + Note that some PDF files use (xmp)metadata streams instead of document + information dictionaries, and these metadata streams will not be + accessed by this function. + """ + return super().metadata + + @metadata.setter + def metadata( + self, + value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]], + ) -> None: + if value is None: + self._info = None + else: + if self._info is not None: + self._info.clear() + else: + self._info = DictionaryObject() + self.add_metadata(value) + def add_metadata(self, infos: Dict[str, Any]) -> None: """ Add custom metadata to the output. diff --git a/tests/test_writer.py b/tests/test_writer.py index 794dd0469..21e1e5538 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1795,9 +1795,32 @@ def test_missing_info(): writer = PdfWriter(clone_from=reader) assert len(writer.pages) == len(reader.pages) + assert writer.metadata is None + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - writer._info = reader._info + writer.metadata = reader.metadata assert dict(writer._info) == dict(reader._info) + assert writer.metadata == reader.metadata + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + + writer.metadata = {} + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + assert writer.metadata == {} + + writer.metadata = None + writer.metadata = None # for code checking + assert writer.metadata is None + assert PdfWriter().metadata == {"/Producer": "pypdf"} + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() @pytest.mark.enable_socket() @@ -2430,5 +2453,11 @@ def test_increment_writer(caplog): # clone without info writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None + writer.metadata = {} + assert writer.metadata == {} assert len(writer.list_objects_in_increment()) == 1 - assert writer._info == {} + writer.metadata = None + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None From 5147266a2948b9b91decee9f4924be48bd102d32 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Aug 2024 12:19:16 +0200 Subject: [PATCH 16/40] coverage --- tests/test_writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 21e1e5538..28782fbcd 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1809,13 +1809,14 @@ def test_missing_info(): assert b"/Info" in b.getvalue() writer.metadata = {} + writer._info = {} # for code coverage b = BytesIO() writer.write(b) assert b"/Info" in b.getvalue() assert writer.metadata == {} writer.metadata = None - writer.metadata = None # for code checking + writer.metadata = None # for code coverage assert writer.metadata is None assert PdfWriter().metadata == {"/Producer": "pypdf"} b = BytesIO() From ec9aafe247b2d383d978882716f79fbf868883c3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Aug 2024 12:43:41 +0200 Subject: [PATCH 17/40] oups --- tests/test_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 28782fbcd..b5dcd3357 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1809,7 +1809,7 @@ def test_missing_info(): assert b"/Info" in b.getvalue() writer.metadata = {} - writer._info = {} # for code coverage + writer._info = DictionaryObject() # for code coverage b = BytesIO() writer.write(b) assert b"/Info" in b.getvalue() From 14a93f1718b40beafea976e77ca9f2e71f2a1c4b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:28:41 +0200 Subject: [PATCH 18/40] move to X-reference stream for increment this prevents "repairation" within acrobat --- pypdf/_writer.py | 57 +++++++++++++++++++++++-------- pypdf/generic/_data_structures.py | 3 +- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index d400cf5f0..86aa120c0 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -31,6 +31,7 @@ import enum import hashlib import re +import struct import uuid from io import BytesIO, FileIO, IOBase from itertools import compress @@ -1351,8 +1352,8 @@ def write_stream(self, stream: StreamType) -> None: if self.incremental: self._reader.stream.seek(0) stream.write(self._reader.stream.read(-1)) - xref_location = self._write_increment(stream) - self._write_trailer(stream, xref_location) + if len(self.list_objects_in_increment()) > 0: + self._write_increment(stream) # writes objs, Xref stream and startx else: object_positions, free_objects = self._write_pdf_structure(stream) xref_location = self._write_xref_table( @@ -1413,7 +1414,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]: ) ] - def _write_increment(self, stream: StreamType) -> int: + def _write_increment(self, stream: StreamType) -> None: object_positions = {} object_blocks = [] current_start = -1 @@ -1448,14 +1449,41 @@ def _write_increment(self, stream: StreamType) -> int: object_blocks.append([current_start, current_stop - current_start]) # write incremented xref xref_location = stream.tell() - stream.write(b"xref\n") - stream.write(b"0 1\n") - stream.write(b"0000000000 65535 f \n") - for block in object_blocks: - stream.write(f"{block[0]} {block[1]}\n".encode()) - for i in range(block[0], block[0] + block[1]): - stream.write(f"{object_positions[i]:0>10} {0:0>5} n \n".encode()) - return xref_location + xr_id = len(self._objects) + 1 + stream.write(f"{xr_id} 0 obj".encode()) + init_data = { + NameObject("/Type"): NameObject("/XRef"), + NameObject("/Size"): NumberObject(xr_id + 1), + NameObject("/Root"): self.root_object.indirect_reference, + NameObject("/Filter"): NameObject("/FlateDecode"), + NameObject("/Index"): ArrayObject( + [NumberObject(_it) for _su in object_blocks for _it in _su] + ), + NameObject("/W"): ArrayObject( + [NumberObject(1), NumberObject(4), NumberObject(1)] + ), + "__streamdata__": b"", + } + if self._info is not None and ( + not self.incremental + or self._info.hash_bin() # kept for future + != self._original_hash[ + cast(IndirectObject, self._info.indirect_reference).idnum - 1 + ] + ): + init_data[NameObject(TK.INFO)] = self._info.indirect_reference + if self.incremental: # kept for future + init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) + elif self._ID: + init_data[NameObject(TK.ID)] = self._ID + xr = StreamObject.initialize_from_dictionary(init_data) + xr.set_data( + b"".join( + [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()] + ) + ) + xr.write_to_stream(stream) + stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]: object_positions = [] @@ -1507,12 +1535,11 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: { NameObject(TK.SIZE): NumberObject(len(self._objects) + 1), NameObject(TK.ROOT): self.root_object.indirect_reference, - NameObject(TK.INFO): self._info_obj, } ) - if self.incremental: - trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) - if self._ID: + if self._info is not None: + trailer[NameObject(TK.INFO)] = self._info.indirect_reference + if self._ID is not None: trailer[NameObject(TK.ID)] = self._ID if self._encrypt_entry: trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index d048da8cb..fc71bf5bf 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -948,7 +948,8 @@ def initialize_from_dictionary( retval = DecodedStreamObject() retval._data = data["__streamdata__"] del data["__streamdata__"] - del data[SA.LENGTH] + if SA.LENGTH in data: + del data[SA.LENGTH] retval.update(data) return retval From 53e141fe12f05b633f0289bbb5d3ad35d51a3e13 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:54:09 +0200 Subject: [PATCH 19/40] coverage --- pypdf/_writer.py | 11 +++++------ tests/test_writer.py | 8 ++++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 86aa120c0..ad48882dc 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1464,17 +1464,16 @@ def _write_increment(self, stream: StreamType) -> None: ), "__streamdata__": b"", } - if self._info is not None and ( - not self.incremental - or self._info.hash_bin() # kept for future + if ( + self._info is not None + and self._info.hash_bin() # kept for future != self._original_hash[ cast(IndirectObject, self._info.indirect_reference).idnum - 1 ] ): init_data[NameObject(TK.INFO)] = self._info.indirect_reference - if self.incremental: # kept for future - init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) - elif self._ID: + init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref) + if self._ID: init_data[NameObject(TK.ID)] = self._ID xr = StreamObject.initialize_from_dictionary(init_data) xr.set_data( diff --git a/tests/test_writer.py b/tests/test_writer.py index 794dd0469..6cedc9443 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2371,11 +2371,19 @@ def test_increment_writer(caplog): # test writing with empty increment b = BytesIO() writer.write(b) + with open( + RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", "rb" + ) as f: + assert b.getvalue() == f.read(-1) b.seek(0) writer2 = PdfWriter(b, incremental=True) assert len([x for x in writer2._objects if x is not None]) == len( [x for x in writer._objects if x is not None] ) + writer2.add_metadata({"/Author": "test"}) + assert len(writer2.list_objects_in_increment()) == 1 + b = BytesIO() + writer2.write(b) # modify one object writer.pages[0][NameObject("/MediaBox")] = ArrayObject( From b4b7c1bf96cd468fdb7687f391c0238cfc38ad57 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 16:09:25 +0200 Subject: [PATCH 20/40] coverage --- tests/test_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 6cedc9443..7b9cbf003 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2440,3 +2440,5 @@ def test_increment_writer(caplog): writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) assert len(writer.list_objects_in_increment()) == 1 assert writer._info == {} + b = BytesIO() + writer.write(b) From 7bc3abddae4fa04f4e8d416bb4280c1d0444bc38 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 16:12:13 +0200 Subject: [PATCH 21/40] coverage --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ad48882dc..74c066e50 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1445,8 +1445,8 @@ def _write_increment(self, stream: StreamType) -> None: ) current_start = idnum current_stop = idnum + 1 - if current_start > 0: - object_blocks.append([current_start, current_stop - current_start]) + assert current_start > 0, "for pytest only" + object_blocks.append([current_start, current_stop - current_start]) # write incremented xref xref_location = stream.tell() xr_id = len(self._objects) + 1 From ffa2f0c5506a0aeae6139f606006df85cf05c421 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 16:55:09 +0200 Subject: [PATCH 22/40] fix --- pypdf/_writer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 74c066e50..409244727 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1464,11 +1464,16 @@ def _write_increment(self, stream: StreamType) -> None: ), "__streamdata__": b"", } - if ( - self._info is not None - and self._info.hash_bin() # kept for future + # below just to trick mypy for code simplification : will be reworked in next PR + assert isinstance( + cast(IndirectObject, self._info).indirect_reference, IndirectObject + ), "for mypy" + if self._info is not None and ( + cast(IndirectObject, self._info).indirect_reference.idnum - 1 + >= len(self._original_hash) + or cast(IndirectObject, self._info).hash_bin() # kept for future != self._original_hash[ - cast(IndirectObject, self._info.indirect_reference).idnum - 1 + cast(IndirectObject, self._info).indirect_reference.idnum - 1 ] ): init_data[NameObject(TK.INFO)] = self._info.indirect_reference From b072952b9c101a3530d07b1d4c1c975f1153352f Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 16:57:48 +0200 Subject: [PATCH 23/40] mypy --- pypdf/_writer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 409244727..886fcbca7 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1464,16 +1464,12 @@ def _write_increment(self, stream: StreamType) -> None: ), "__streamdata__": b"", } - # below just to trick mypy for code simplification : will be reworked in next PR - assert isinstance( - cast(IndirectObject, self._info).indirect_reference, IndirectObject - ), "for mypy" if self._info is not None and ( - cast(IndirectObject, self._info).indirect_reference.idnum - 1 + self._info.indirect_reference.idnum - 1 # type: ignore >= len(self._original_hash) or cast(IndirectObject, self._info).hash_bin() # kept for future != self._original_hash[ - cast(IndirectObject, self._info).indirect_reference.idnum - 1 + self._info.indirect_reference.idnum - 1 # type: ignore ] ): init_data[NameObject(TK.INFO)] = self._info.indirect_reference From 454c4fe026d2f076adb985bb46307cd2661b082e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:30:44 +0200 Subject: [PATCH 24/40] coverage --- tests/test_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 0d8e61554..e06db389b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2441,6 +2441,8 @@ def test_increment_writer(caplog): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) # 1 object is modified: page 0 inherits MediaBox so is changed assert len(writer.list_objects_in_increment()) == 1 + b = BytesIO() + writer.write(b) writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) # 1 object is modified: page 0 inherits MediaBox so is changed From 494e00ae27f6b71ee503517b5bb48809866a57e2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:18:08 +0200 Subject: [PATCH 25/40] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index ea3c93aab..fcbc9904f 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -375,7 +375,7 @@ def _get_page_in_node( page_number: int, ) -> Tuple[DictionaryObject, int]: """ - Retrieve the node and position within the /Kids containing the page + Retrieve the node and position within the /Kids containing the page. if page_number is greater than the number of page, it returns top node, -1 """ top = cast(DictionaryObject, self.root_object["/Pages"]) From eba1c9f4639b694e8e20d08d0478b53520e57d05 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:18:24 +0200 Subject: [PATCH 26/40] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index fcbc9904f..aecef700f 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -385,7 +385,7 @@ def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]: if node["/Type"] == "/Page": if page_number == mi: return node, -1 - # else: + # else return None, mi + 1 if (page_number - mi) >= ma: # not in nodes below if node == top: From d68db51fe5342b7564e8df95a429c9c2927ea522 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:18:39 +0200 Subject: [PATCH 27/40] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index aecef700f..edaae356c 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -376,7 +376,7 @@ def _get_page_in_node( ) -> Tuple[DictionaryObject, int]: """ Retrieve the node and position within the /Kids containing the page. - if page_number is greater than the number of page, it returns top node, -1 + If page_number is greater than the number of pages, it returns the top node, -1 """ top = cast(DictionaryObject, self.root_object["/Pages"]) From 8b3182dcb7e04fca88a244f6e021b39579cfa150 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:24:41 +0200 Subject: [PATCH 28/40] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index edaae356c..2abcc52fe 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -401,7 +401,7 @@ def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]: # else: # ... at lower levels return n, i mi = i - raise PyPdfError("abnormal, can not find the node") + raise PyPdfError("Unexpectedly cannot find the node.") node, idx = recurs(top, 0) assert isinstance(node, DictionaryObject) From fe6aac7201c2b08f2c9c6bdc413efd427cfab49c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:24:57 +0200 Subject: [PATCH 29/40] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 886fcbca7..f214bac75 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1393,7 +1393,7 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: def list_objects_in_increment(self) -> List[IndirectObject]: """ - For debug / analysis + For debugging/analysis. Provides the list of new/modified objects that will be written in the increment Deleted Objects will not be freeed but will become orphans From 0be4bb4469db39f34d9ac0dd6a6eaa3ff9762338 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:25:09 +0200 Subject: [PATCH 30/40] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index f214bac75..faa56aa69 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1395,7 +1395,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]: """ For debugging/analysis. Provides the list of new/modified objects that will be written - in the increment + in the increment. Deleted Objects will not be freeed but will become orphans Returns: From 4c585c0244a8ae8afbdfed807cc4a8ca12e04e14 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:28:27 +0200 Subject: [PATCH 31/40] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index faa56aa69..19e643503 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -579,7 +579,7 @@ def insert_page( if index < 0: index = len(self.flattened_pages) + index if index < 0: - raise ValueError("invalid index value") + raise ValueError("Invalid index value") if index >= len(self.flattened_pages): return self.add_page(page, excluded_keys) else: From fbe54d0f0c4316911fbd60450c49ca405bb84243 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:37:06 +0200 Subject: [PATCH 32/40] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 19e643503..e8af7fcc7 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -154,7 +154,7 @@ class PdfWriter(PdfDocCommon): incremental: If true, loads the document and set the PdfWriter in incremental mode - When writing in incremental the original document is written first and new/modified + When writing incrementally, the original document is written first and new/modified are appened. to be used for signed document/forms to keep signature valid. """ From e3c1e2c670b0d4f9e9ecead768164d69ae2b4630 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:37:31 +0200 Subject: [PATCH 33/40] Update pypdf/_writer.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e8af7fcc7..9d828baad 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -491,7 +491,7 @@ def _add_page( node = node.get(PA.PARENT, None) cpt -= 1 if cpt < 0: - raise PyPdfError("Recursive Error detected") + raise PyPdfError("Too many recursive calls!") return page def set_need_appearances_writer(self, state: bool = True) -> None: From 6e659431392c1ecf2d3b563f23f67294c0f46ada Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 09:43:19 +0200 Subject: [PATCH 34/40] clarify assert mypy --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 9d828baad..ef4bbfbfd 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -554,7 +554,7 @@ def add_page( Returns: The added PageObject. """ - assert self.flattened_pages is not None + assert self.flattened_pages is not None, "mypy" return self._add_page(page, len(self.flattened_pages), excluded_keys) def insert_page( @@ -575,7 +575,7 @@ def insert_page( Returns: The added PageObject. """ - assert self.flattened_pages is not None + assert self.flattened_pages is not None, "mypy" if index < 0: index = len(self.flattened_pages) + index if index < 0: From 412167298ebe5e8c55996f0b3ee24bcc6e1b8838 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 10:24:26 +0200 Subject: [PATCH 35/40] doc hash_bin --- pypdf/generic/_base.py | 45 +++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 9dfb25a29..d02a79810 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -55,9 +55,10 @@ class PdfObject(PdfObjectProtocol): def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ raise NotImplementedError( f"{self.__class__.__name__} does not implement .hash_bin() so far" @@ -186,9 +187,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__,)) @@ -230,9 +232,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self.value)) @@ -282,9 +285,10 @@ def __hash__(self) -> int: def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self.idnum, self.generation, id(self.pdf))) @@ -448,9 +452,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self.as_numeric)) @@ -501,9 +506,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self.as_numeric())) @@ -552,9 +558,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, bytes(self))) @@ -639,9 +646,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self.original_bytes)) @@ -743,9 +751,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, self)) From bcc5c1da7ecede2c9b7d07de545827bf57823107 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Sep 2024 10:25:57 +0200 Subject: [PATCH 36/40] doc hash_bin --- pypdf/generic/_data_structures.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index fc71bf5bf..08bc2806d 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -133,9 +133,10 @@ def clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash((self.__class__, tuple(x.hash_bin() for x in self))) @@ -381,9 +382,10 @@ def _clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ return hash( (self.__class__, tuple(((k, v.hash_bin()) for k, v in self.items()))) @@ -896,9 +898,10 @@ def _clone( def hash_bin(self) -> int: """ + Used to detect modified object. + Returns: - hash considering type and value - used to detect modified object + Hash considering type and value. """ # use of _data to prevent errors on non decoded stream such as JBIG2 return hash((super().hash_bin(), self._data)) From bc6cabab1045b462a735a7124b504452ca737fd5 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 8 Sep 2024 16:39:14 +0200 Subject: [PATCH 37/40] Update pypdf/_page.py --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 3888dcf6c..88943c3de 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2439,7 +2439,7 @@ def __delitem__(self, index: Union[int, slice]) -> None: parent = parent.get("/Parent", None) except ValueError: # from index if first: - raise PdfReadError(f"Page Not Found in Page Tree {ind}") + raise PdfReadError(f"Page not found in page tree: {ind}") break def __iter__(self) -> Iterator[PageObject]: From 8659de278b987fad07c3358c96a2a881c4d4949d Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 8 Sep 2024 16:39:54 +0200 Subject: [PATCH 38/40] Update pypdf/_writer.py --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ef4bbfbfd..e0a680972 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -155,7 +155,7 @@ class PdfWriter(PdfDocCommon): incremental: If true, loads the document and set the PdfWriter in incremental mode When writing incrementally, the original document is written first and new/modified - are appened. to be used for signed document/forms to keep signature valid. + content is appended. To be used for signed document/forms to keep signature valid. """ def __init__( From 99e6dfc93abdf931fc89485ffa5ddf7a49a7010d Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 8 Sep 2024 16:42:51 +0200 Subject: [PATCH 39/40] Apply suggestions from code review --- pypdf/_writer.py | 10 +++++----- pypdf/constants.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e0a680972..8d6d9f390 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -172,13 +172,13 @@ def __init__( self._objects: List[Optional[PdfObject]] = [] """ The indirect objects in the PDF. - for the incremental it will be filled with None - in clone_reader_document_root + For the incremental case, it will be filled with None + in clone_reader_document_root. """ self._original_hash: List[int] = [] """ - list of hashes after import; used to identify changes + List of hashes after import; used to identify changes. """ self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {} @@ -454,7 +454,7 @@ def _add_page( excluded_keys: Iterable[str] = (), ) -> PageObject: if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE: - raise ValueError("Invalid page Object") + raise ValueError("Invalid page object") assert self.flattened_pages is not None, "for mypy" page_org = page excluded_keys = list(excluded_keys) @@ -1396,7 +1396,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]: For debugging/analysis. Provides the list of new/modified objects that will be written in the increment. - Deleted Objects will not be freeed but will become orphans + Deleted objects will not be freed but will become orphans. Returns: List of (new / modified) IndirectObjects diff --git a/pypdf/constants.py b/pypdf/constants.py index a7e67aacc..d7a8e310f 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -210,7 +210,7 @@ class PagesAttributes: PARENT = "/Parent" # dictionary, required; indirect reference to pages object KIDS = "/Kids" # array, required; List of indirect references COUNT = "/Count" # integer, required; the number of leaf nodes (page objects) - # that are descendants of this node within the page tree + # that are descendants of this node within the page tree class PageAttributes: From be488722c96e4da09a254401686fdfbe54bd33e6 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sat, 14 Sep 2024 13:17:21 +0200 Subject: [PATCH 40/40] improve docs --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 73fcbee15..edcd391e4 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1566,9 +1566,9 @@ def metadata(self) -> Optional[DocumentInformation]: Retrieve/set the PDF file's document information dictionary, if it exists. Args: - value: dict with the entries to be set. if None : remove the /Info entry from the pdf. + value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF. - Note that some PDF files use (xmp)metadata streams instead of document + Note that some PDF files use (XMP) metadata streams instead of document information dictionaries, and these metadata streams will not be accessed by this function. """