From 4daade64be740acb5ade684f0f28abb45cbb8cf3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 19 Jul 2022 23:28:55 +0200 Subject: [PATCH] ROB : cope with invalid parent xref (#1089) rebuild the xref if the parent chained xref is invalid --- PyPDF2/_reader.py | 9 +++++++++ tests/test_reader.py | 11 +++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index ac92e4e16..15bb2e7c3 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1317,6 +1317,15 @@ def read(self, stream: StreamType) -> None: if found: continue # no xref table found at specified location + if "/Root" in self.trailer and not self.strict: + # if Root has been already found, just raise warning + warnings.warn("Invalid parent xref., rebuild xref", PdfReadWarning) + try: + self._rebuild_xref_table(stream) + break + except Exception: + raise PdfReadError("can not rebuild xref") + break raise PdfReadError("Could not find xref table at specified location") # if not zero-indexed, verify that the table is correct; change it if necessary if self.xref_index and not self.strict: diff --git a/tests/test_reader.py b/tests/test_reader.py index 7171953c7..7336549f0 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -770,12 +770,12 @@ def test_get_fields(): assert dict(fields["c1-1"]) == ({"/FT": "/Btn", "/T": "c1-1"}) +# covers also issue 1089 +@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_get_fields_read_else_block(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/934/934771.pdf" name = "tika-934771.pdf" - with pytest.raises(PdfReadError) as exc: - PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - assert exc.value.args[0] == "Could not find xref table at specified location" + PdfReader(BytesIO(get_pdf_from_url(url, name=name))) def test_get_fields_read_else_block2(): @@ -786,12 +786,11 @@ def test_get_fields_read_else_block2(): assert fields is None +@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_get_fields_read_else_block3(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957721.pdf" name = "tika-957721.pdf" - with pytest.raises(PdfReadError) as exc: - PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - assert exc.value.args[0] == "Could not find xref table at specified location" + PdfReader(BytesIO(get_pdf_from_url(url, name=name))) def test_metadata_is_none():