def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None, the /Contents entry is deleted.
    """
    page_ref = getattr(self, "indirect_reference", None)
    if page_ref is None:
        # The page is not attached: the content is stored directly in the page.
        if content is not None:
            self[NameObject(PG.CONTENTS)] = content
        elif PG.CONTENTS in self:
            # "No content" means dropping the key, not storing None under it.
            del self[PG.CONTENTS]
        return

    def _nullify(old: object) -> None:
        """Best effort: overwrite the document slot of ``old`` with a NullObject."""
        if isinstance(old, IndirectObject):
            ref = old
        else:
            ref = getattr(old, "indirect_reference", None)
        try:
            # The object table belongs to the owning document, not to the page.
            page_ref.pdf._objects[ref.idnum - 1] = NullObject()
        except AttributeError:
            # Direct object (no reference) or a document without an
            # ``_objects`` table: nothing to nullify.
            pass

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # NOTE(review): streams shared with other pages are nullified too;
        # this is the documented intent ("nullify old objects") -- confirm.
        for item in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            _nullify(item)

    if content is None:
        if PG.CONTENTS not in self:
            return
        _nullify(self[PG.CONTENTS])
        del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        try:
            self[NameObject(PG.CONTENTS)] = page_ref.pdf._add_object(content)
        except AttributeError:
            # Applies at least to a page that is not in a writer.
            # As a fallback the content is stored as a direct object, although
            # this is not in accordance with the PDF reference; it is fixed
            # when the page goes through ``_add_object``.
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the slot of the previous contents for the new content.
        # TODO: in the future this may require generation management.
        content.indirect_reference = self[PG.CONTENTS].indirect_reference
        try:
            page_ref.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # Applies at least to a page that is not in a writer.
            # As a fallback the content is stored as a direct object, although
            # this is not in accordance with the PDF reference; it is fixed
            # when the page goes through ``_add_object``.
            self[NameObject(PG.CONTENTS)] = content
def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register ``obj`` in this writer and return its indirect reference.

    An object that already carries a reference into this writer is not added
    again; its existing reference is returned.

    Args:
        obj: the object to register.

    Returns:
        The indirect reference pointing at ``obj`` in this writer.
    """
    # ``indirect_reference`` may be missing or explicitly None; both mean
    # "not registered yet" and must not raise.
    existing = getattr(obj, "indirect_reference", None)
    if existing is not None and existing.pdf == self:
        return existing
    # check for /Contents in Pages (/Contents in annotation are strings)
    if isinstance(obj, DictionaryObject) and isinstance(
        obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)
    ):
        obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    # Object numbers are 1-based: the new object's number is the table length.
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference