From 6a812f38145e52f6d8102f69d8aa082c7a91130c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 23 May 2023 11:04:37 +0200 Subject: [PATCH 1/5] ENH add PageNumber property add a new property to ease access to page number on all objects --- pypdf/_page.py | 17 +++++++++++++++++ tests/test_page.py | 12 +++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 4d53d1b54..30336bf99 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1519,6 +1519,23 @@ def compressContentStreams(self) -> None: # deprecated ) self.compress_content_streams() + @property + def page_number(self) -> int: + """ + Read-only property which return the page number with the pdf file. + + Returns: + int : page number ; -1 if the page is not attached to a pdf + """ + if self.indirect_reference is None: + return -1 + else: + try: + lst = self.indirect_reference.pdf.pages + return lst.index(self) + except ValueError: + return -1 + def _debug_for_extract(self) -> str: # pragma: no cover out = "" for ope, op in ContentStream( diff --git a/tests/test_page.py b/tests/test_page.py index a7fa50364..4704d707e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -261,13 +261,18 @@ def test_page_transformations(): ) def test_compress_content_streams(pdf_path, password): reader = PdfReader(pdf_path) + writer = PdfWriter() if password: reader.decrypt(password) + for i, page in enumerate(reader.pages): + assert i == page.page_number + assert isinstance(reader.pages[0].get_contents(), ContentStream) writer.clone_document_from_reader(reader) assert isinstance(writer.pages[0].get_contents(), ContentStream) - for page in writer.pages: + for i, page in enumerate(writer.pages): + assert i == page.page_number page.compress_content_streams() # test from reader should fail as adding_object out of @@ -1119,6 +1124,11 @@ def test_merge_transformed_page_into_blank(): True, True, ) + blank = 
PageObject.create_blank_page(width=100, height=100) + assert blank.page_number == -1 + inserted_blank = w.add_page(blank) + assert blank.page_number == -1 # the inserted page is a clone + assert inserted_blank.page_number == len(w.pages) - 1 def test_pages_printing(): From d0c9c62a098aaf4b1c07522339fd6180510cce9b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 23 May 2023 11:27:27 +0200 Subject: [PATCH 2/5] complete test --- tests/test_page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_page.py b/tests/test_page.py index 4704d707e..f72f6d580 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1129,6 +1129,8 @@ def test_merge_transformed_page_into_blank(): inserted_blank = w.add_page(blank) assert blank.page_number == -1 # the inserted page is a clone assert inserted_blank.page_number == len(w.pages) - 1 + del w._pages.get_object()["/Kids"][-1] + assert inserted_blank.page_number == -1 def test_pages_printing(): From 043da6622e113370708957a06b774cbc217e4f2c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 23 May 2023 11:43:27 +0200 Subject: [PATCH 3/5] BUG: Append pdf with named destination using numbers for pages Closes #471. The issue was with named destinations that use page numbers instead of indirect object references to point to pages; this is normally not expected.
--- pypdf/_writer.py | 21 +++++++++++++++++++-- tests/test_writer.py | 17 +++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e3968e96c..03fb17f8a 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -2755,8 +2755,25 @@ def merge( ) # need for the outline processing below for dest in reader._namedDests.values(): arr = dest.dest_array - if isinstance(dest["/Page"], NullObject): - pass # self.add_named_destination_array(dest["/Title"],arr) + if ( # noqa: SIM114 + "/Names" in self._root_object + and dest["/Title"] in self._root_object["/Names"]["/Dests"]["/Names"] + ): + # already exists : should not duplicate it + pass + elif isinstance(dest["/Page"], NullObject): + pass + elif isinstance(dest["/Page"], int): + # the page reference is a page number normally not iaw Pdf Reference + # page numbers as int are normally accepted only in external goto + p = reader.pages[dest["/Page"]] + try: + arr[NumberObject(0)] = NumberObject( + srcpages[p.indirect_reference.idnum].page_number + ) + self.add_named_destination_array(dest["/Title"], arr) + except KeyError: + pass elif dest["/Page"].indirect_reference.idnum in srcpages: arr[NumberObject(0)] = srcpages[ dest["/Page"].indirect_reference.idnum diff --git a/tests/test_writer.py b/tests/test_writer.py index 80a3158aa..5b07432bb 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1339,3 +1339,20 @@ def test_iss1767(): name = "iss1723.pdf" in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) PdfWriter(clone_from=in_pdf) + + +@pytest.mark.enable_socket() +def test_named_dest_page_number(): + """ + Closes iss471 + tests appending with named destinations as integers + """ + url = "https://github.com/py-pdf/pypdf/files/10704333/central.pdf" + name = "central.pdf" + w = PdfWriter() + w.add_blank_page(100, 100) + w.append(BytesIO(get_pdf_from_url(url, name=name)), pages=[0, 1, 2]) + assert len(w._root_object["/Names"]["/Dests"]["/Names"]) == 
2 + assert w._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) + w.append(BytesIO(get_pdf_from_url(url, name=name))) + assert len(w._root_object["/Names"]["/Dests"]["/Names"]) == 6 From 99654a47ab76a3f1f6d68833123b3ced5a67df42 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 23 May 2023 12:19:11 +0200 Subject: [PATCH 4/5] mypy --- pypdf/_writer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 03fb17f8a..ce8b670eb 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -2755,9 +2755,12 @@ def merge( ) # need for the outline processing below for dest in reader._namedDests.values(): arr = dest.dest_array - if ( # noqa: SIM114 - "/Names" in self._root_object - and dest["/Title"] in self._root_object["/Names"]["/Dests"]["/Names"] + if "/Names" in self._root_object and dest["/Title"] in cast( # noqa: SIM114 + list, + cast( + DictionaryObject, + cast(DictionaryObject, self._root_object["/Names"])["/Dests"], + )["/Names"], ): # already exists : should not duplicate it pass @@ -2767,6 +2770,7 @@ def merge( # the page reference is a page number normally not iaw Pdf Reference # page numbers as int are normally accepted only in external goto p = reader.pages[dest["/Page"]] + assert p.indirect_reference is not None try: arr[NumberObject(0)] = NumberObject( srcpages[p.indirect_reference.idnum].page_number From 1575a8a8290591706403325d22a6956345258e66 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 23 May 2023 12:52:42 +0200 Subject: [PATCH 5/5] test coverage --- tests/test_writer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 5b07432bb..297520ec4 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -21,6 +21,7 @@ Fit, IndirectObject, NameObject, + NullObject, NumberObject, RectangleObject, StreamObject, @@ -1356,3 +1357,12 
@@ def test_named_dest_page_number(): assert w._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) w.append(BytesIO(get_pdf_from_url(url, name=name))) assert len(w._root_object["/Names"]["/Dests"]["/Names"]) == 6 + w2 = PdfWriter() + w2.add_blank_page(100, 100) + dest = w2.add_named_destination("toto", 0) + dest.get_object()[NameObject("/D")][0] = NullObject() + b = BytesIO() + w2.write(b) + b.seek(0) + w.append(b) + assert len(w._root_object["/Names"]["/Dests"]["/Names"]) == 6