Skip to content

Commit

Permalink
BUG : file expansion when updating with Page Contents
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Jun 21, 2023
1 parent ab42636 commit 3a2dce8
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 17 deletions.
65 changes: 48 additions & 17 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,50 @@ def getContents(self) -> Optional[ContentStream]: # deprecated
deprecation_with_replacement("getContents", "get_contents", "3.0.0")
return self.get_contents()

def replace_contents(self, content: Optional[ContentStream]) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    When the page already holds an indirect /Contents object, the new content
    takes over that object's slot in the owner's object table instead of
    being appended, so repeated updates do not grow the file.

    Args:
        content: new content. If None, the /Contents entry is deleted.
    """
    # The owner's object table is only reachable when the page is attached to
    # something exposing ``_objects``; otherwise fall back to direct storage.
    try:
        owner = self.indirect_reference.pdf
        objects = owner._objects
    except AttributeError:
        owner = None
        objects = None

    def _nullify(obj: object) -> None:
        """Overwrite obj's slot in the owner's object table with a NullObject."""
        ref = getattr(obj, "indirect_reference", None)
        # Only touch slots that belong to this page's owner.
        if objects is None or ref is None or getattr(ref, "pdf", None) is not owner:
            return
        objects[ref.idnum - 1] = NullObject()

    old_content = self.get(PG.CONTENTS, None)

    if isinstance(old_content, ArrayObject):
        # The old streams of an array become unreachable once the entry is
        # replaced, so blank them out in the object table.
        # NOTE(review): assumes these streams are not shared with another
        # page - confirm against callers.
        for item in old_content:
            _nullify(item)

    if content is None:
        if PG.CONTENTS in self:
            # Best effort: always drop the key, even when the old object
            # cannot be nullified (no owner / direct object).
            _nullify(old_content)
            del self[PG.CONTENTS]
        return

    if not hasattr(old_content, "indirect_reference"):
        # No previous indirect object to reuse: register a new one.
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not
            # in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
        return

    # Reuse the slot of the previous content object.
    # TODO: in the future this may require generation management
    content.indirect_reference = old_content.indirect_reference
    try:
        self.indirect_reference.pdf._objects[
            content.indirect_reference.idnum - 1
        ] = content
    except AttributeError:
        # applies at least for page not in writer
        # as a backup solution, we put content as an object although not
        # in accordance with pdf ref
        # this will be fixed with the _add_object
        self[NameObject(PG.CONTENTS)] = content

def merge_page(self, page2: "PageObject", expand: bool = False) -> None:
"""
Merge the content streams of two pages into one.
Expand Down Expand Up @@ -1058,7 +1102,7 @@ def _merge_page(
if expand:
self._expand_mediabox(page2, ctm)

self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf)
self.replace_contents(ContentStream(new_content_array, self.pdf))
self[NameObject(PG.RESOURCES)] = new_resources
self[NameObject(PG.ANNOTS)] = new_annots

Expand Down Expand Up @@ -1193,18 +1237,7 @@ def _merge_page_writer(
if expand:
self._expand_mediabox(page2, ctm)

if PG.CONTENTS not in self:
self[NameObject(PG.CONTENTS)] = pdf._add_object(ContentStream(None, pdf))
ind = self.raw_get(PG.CONTENTS)
try:
if not isinstance(ind, IndirectObject):
raise KeyError
pdf._replace_object(ind, ContentStream(new_content_array, pdf))
except KeyError:
self[NameObject(PG.CONTENTS)] = pdf._add_object(
ContentStream(new_content_array, pdf)
)

self.replace_contents(new_content_array)
# self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, pdf)
# self[NameObject(PG.RESOURCES)] = new_resources
# self[NameObject(PG.ANNOTS)] = new_annots
Expand Down Expand Up @@ -1545,7 +1578,7 @@ def add_transformation(
if content is not None:
content = PageObject._add_transformation_matrix(content, self.pdf, ctm)
content = PageObject._push_pop_gs(content, self.pdf)
self[NameObject(PG.CONTENTS)] = content
self.replace_contents(content)
# if expanding the page to fit a new page, calculate the new media box size
if expand:
corners = [
Expand Down Expand Up @@ -1704,9 +1737,7 @@ def compress_content_streams(self) -> None:
if self.indirect_reference is not None and hasattr(
self.indirect_reference.pdf, "_add_object"
):
self[
NameObject(PG.CONTENTS)
] = self.indirect_reference.pdf._add_object(content_obj)
self.replace_contents(content_obj)
else:
raise ValueError("Page must be part of a PdfWriter")

Expand Down
5 changes: 5 additions & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,11 @@ def pdf_header(self, new_header: bytes) -> None:
def _add_object(self, obj: PdfObject) -> IndirectObject:
    """
    Register obj in this writer's object table and return its reference.

    An object that already carries a reference owned by this writer is not
    added again; its existing reference is returned.

    Args:
        obj: the object to register.

    Returns:
        The indirect reference to obj; its number is the 1-based position of
        obj in ``self._objects`` and its generation is 0.
    """
    # ``indirect_reference`` may exist but be None (object not attached yet);
    # treat that the same as a missing attribute instead of failing on ``.pdf``.
    existing = getattr(obj, "indirect_reference", None)
    if existing is not None and existing.pdf == self:
        return existing
    # check for /Contents in Pages (/Contents in annotation are strings)
    if isinstance(obj, DictionaryObject) and isinstance(
        obj.get(PG.CONTENTS, None), (ArrayObject, DictionaryObject)
    ):
        # Register the contents first so the entry holds a reference.
        obj[NameObject(PG.CONTENTS)] = self._add_object(obj[PG.CONTENTS])
    self._objects.append(obj)
    obj.indirect_reference = IndirectObject(len(self._objects), 0, self)
    return obj.indirect_reference
Expand Down
28 changes: 28 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,3 +1174,31 @@ def test_image_new_property():
with pytest.raises(KeyError):
reader.pages[0]._get_image(["test"], reader.pages[0])
assert list(PageObject(None, None).images) == []


@pytest.mark.samples()
def test_compression():
    """Merging a stamp, then compressing, adds only one object (issue #1897)."""
    # Imported lazily so the module still loads when fpdf is not installed.
    from fpdf import FPDF

    # Build a one-page stamp document in memory.
    stamp_doc = FPDF()
    stamp_doc.add_page()
    stamp_doc.set_font("helvetica", "B", 16)
    stamp_doc.cell(40, 10, "Hello World!")
    stamp_stream = BytesIO(stamp_doc.output())

    stamp_page = PdfReader(stamp_stream).pages[0]

    writer = PdfWriter()
    writer.append(SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", [1])
    expected_count = len(writer._objects) + 1  # font is added that's all

    for target in writer.pages:
        target.merge_page(stamp_page)
    assert len(writer._objects) == expected_count

    # Compression must replace the content in place, not append a new object.
    for target in writer.pages:
        target.compress_content_streams()
    assert len(writer._objects) == expected_count

0 comments on commit 3a2dce8

Please sign in to comment.