BUG: file expansion when updating with Page Contents

closes py-pdf#1897
pubpub-zz · Jun 21, 2023 · 3a2dce8 · 3a2dce8
1 parent ab42636
commit 3a2dce8
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 17 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -920,6 +920,50 @@ def getContents(self) -> Optional[ContentStream]:  # deprecated
         deprecation_with_replacement("getContents", "get_contents", "3.0.0")
         return self.get_contents()
 
+    def replace_contents(self, content: Optional[ContentStream]) -> None:
+        """
+        Replace the page contents with the new content and nullify old objects.
+
+        Nullifying old objects and reusing the existing object slot are best
+        effort: both go through ``self.indirect_reference.pdf._objects``; when
+        that lookup raises AttributeError the step is skipped, or the content
+        is stored directly on the page.
+
+        Args:
+            content: new content; if None, delete the content field.
+        """
+        if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
+            for o in self[PG.CONTENTS]:
+                try:
+                    # the object table belongs to the owning pdf, exactly as in
+                    # the branches below; it is not an attribute of the page
+                    self.indirect_reference.pdf._objects[
+                        o.indirect_reference.idnum - 1
+                    ] = NullObject()
+                except AttributeError:
+                    # element without a reference, or page not in a writer
+                    pass
+        if content is None:
+            if PG.CONTENTS not in self:
+                return
+            try:
+                self.indirect_reference.pdf._objects[
+                    self[PG.CONTENTS].indirect_reference.idnum - 1
+                ] = NullObject()
+            except AttributeError:
+                # nothing to nullify (no reference / no object table);
+                # the field itself is still removed below
+                pass
+            del self[PG.CONTENTS]
+        elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
+            try:
+                self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
+                    content
+                )
+            except AttributeError:
+                # applies at least for a page that is not in a writer
+                # as a backup solution, we store the content as a direct object
+                # although this is not in accordance with the PDF reference;
+                # this will be fixed by _add_object
+                self[NameObject(PG.CONTENTS)] = content
+        else:
+            # reuse the reference of the existing contents
+            # TODO: generation management may be required in the future
+            content.indirect_reference = self[PG.CONTENTS].indirect_reference
+            try:
+                self.indirect_reference.pdf._objects[
+                    content.indirect_reference.idnum - 1
+                ] = content
+            except AttributeError:
+                # applies at least for a page that is not in a writer
+                # as a backup solution, we store the content as a direct object
+                # although this is not in accordance with the PDF reference;
+                # this will be fixed by _add_object
+                self[NameObject(PG.CONTENTS)] = content
+
     def merge_page(self, page2: "PageObject", expand: bool = False) -> None:
         """
         Merge the content streams of two pages into one.
@@ -1058,7 +1102,7 @@ def _merge_page(
         if expand:
             self._expand_mediabox(page2, ctm)
 
-        self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf)
+        self.replace_contents(ContentStream(new_content_array, self.pdf))
         self[NameObject(PG.RESOURCES)] = new_resources
         self[NameObject(PG.ANNOTS)] = new_annots
 
@@ -1193,18 +1237,7 @@ def _merge_page_writer(
         if expand:
             self._expand_mediabox(page2, ctm)
 
-        if PG.CONTENTS not in self:
-            self[NameObject(PG.CONTENTS)] = pdf._add_object(ContentStream(None, pdf))
-        ind = self.raw_get(PG.CONTENTS)
-        try:
-            if not isinstance(ind, IndirectObject):
-                raise KeyError
-            pdf._replace_object(ind, ContentStream(new_content_array, pdf))
-        except KeyError:
-            self[NameObject(PG.CONTENTS)] = pdf._add_object(
-                ContentStream(new_content_array, pdf)
-            )
-
+        self.replace_contents(new_content_array)
         # self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, pdf)
         # self[NameObject(PG.RESOURCES)] = new_resources
         # self[NameObject(PG.ANNOTS)] = new_annots
@@ -1545,7 +1578,7 @@ def add_transformation(
         if content is not None:
             content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
             content = PageObject._push_pop_gs(content, self.pdf)
-            self[NameObject(PG.CONTENTS)] = content
+            self.replace_contents(content)
         # if expanding the page to fit a new page, calculate the new media box size
         if expand:
             corners = [
@@ -1704,9 +1737,7 @@ def compress_content_streams(self) -> None:
                 if self.indirect_reference is not None and hasattr(
                     self.indirect_reference.pdf, "_add_object"
                 ):
-                    self[
-                        NameObject(PG.CONTENTS)
-                    ] = self.indirect_reference.pdf._add_object(content_obj)
+                    self.replace_contents(content_obj)
                 else:
                     raise ValueError("Page must be part of a PdfWriter")
 

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -242,6 +242,11 @@ def pdf_header(self, new_header: bytes) -> None:
     def _add_object(self, obj: PdfObject) -> IndirectObject:
+        # Register obj in self._objects and return its indirect reference.
+        # If obj already carries an indirect_reference whose pdf is this
+        # instance, that reference is returned and nothing is appended.
         if hasattr(obj, "indirect_reference") and obj.indirect_reference.pdf == self:  # type: ignore
             return obj.indirect_reference  # type: ignore
+        # check for /Contents in Pages (/Contents in annotation are strings)
+        # an array or dictionary value is registered first (recursive call) and
+        # the entry is replaced by the reference that call returns
+        if isinstance(obj, DictionaryObject) and isinstance(
+            obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)
+        ):
+            obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
         self._objects.append(obj)
+        # the first argument is the 1-based position of obj in self._objects
+        # NOTE(review): the 0 is presumably the generation number - confirm
         obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
         return obj.indirect_reference

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -1174,3 +1174,31 @@ def test_image_new_property():
     with pytest.raises(KeyError):
         reader.pages[0]._get_image(["test"], reader.pages[0])
     assert list(PageObject(None, None).images) == []
+
+
+@pytest.mark.samples()
+def test_compression():
+    """Merging then compressing pages must not grow the writer (issue #1897)."""
+
+    def create_stamp_pdf() -> BytesIO:
+        from fpdf import FPDF
+
+        stamp = FPDF()
+        stamp.add_page()
+        stamp.set_font("helvetica", "B", 16)
+        stamp.cell(40, 10, "Hello World!")
+        return BytesIO(stamp.output())
+
+    template = PdfReader(create_stamp_pdf())
+    stamp_page = template.pages[0]
+    writer = PdfWriter()
+    writer.append(SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", [1])
+    # only the font of the stamp may be added on top of the initial objects
+    expected_count = len(writer._objects) + 1
+
+    for page in writer.pages:
+        page.merge_page(stamp_page)
+    assert len(writer._objects) == expected_count
+    for page in writer.pages:
+        page.compress_content_streams()
+    assert len(writer._objects) == expected_count