From 8f3bcc175c18142f1802b10a7f400b7460160c85 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 4 Apr 2024 22:33:49 +0200
Subject: [PATCH 1/2] ENH: Tolerate "truncated" xref

closes #2575
---
 pypdf/_reader.py     | 11 +++++++++++
 tests/test_reader.py |  8 ++++++++
 2 files changed, 19 insertions(+)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 54fb33f1b..deb7d3281 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -677,6 +677,14 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
             read_non_whitespace(stream)
             stream.seek(-1, 1)
             size = cast(int, read_object(stream, self))
+            if not isinstance(size, int):
+                logger_warning(
+                    "Invalid/Truncated xref table. Rebuild xref table",
+                    __name__,
+                )
+                self._rebuild_xref_table(stream)
+                stream.read()
+                return
             read_non_whitespace(stream)
             stream.seek(-1, 1)
             cnt = 0
@@ -815,6 +823,9 @@ def _read_xref_tables_and_trailers(
 
     def _read_xref(self, stream: StreamType) -> Optional[int]:
         self._read_standard_xref_table(stream)
+        if stream.read(1) == b"":
+            return None
+        stream.seek(-1, 1)
         read_non_whitespace(stream)
         stream.seek(-1, 1)
         new_trailer = cast(Dict[str, Any], read_object(stream, self))
diff --git a/tests/test_reader.py b/tests/test_reader.py
index f8be72dc1..3bfeee7e2 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -1508,3 +1508,11 @@ def test_corrupted_xref():
     name = "iss2516.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert reader.root_object["/Type"] == "/Catalog"
+
+
+@pytest.mark.enable_socket()
+def test_truncated_xref(caplog):
+    url = "https://github.com/py-pdf/pypdf/files/14843553/002-trivial-libre-office-writer-broken.pdf"
+    name = "iss2575.pdf"
+    PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert "Invalid/Truncated xref table. Rebuild xref table" in caplog.text

From 6e30c6f09771a8c90643d9213b8edc02fbb7cee4 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Fri, 5 Apr 2024 08:10:42 +0200
Subject: [PATCH 2/2] improve wording

---
 pypdf/_reader.py     | 2 +-
 tests/test_reader.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index deb7d3281..034a0d091 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -679,7 +679,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
             size = cast(int, read_object(stream, self))
             if not isinstance(size, int):
                 logger_warning(
-                    "Invalid/Truncated xref table. Rebuild xref table",
+                    "Invalid/Truncated xref table. Rebuilding it.",
                     __name__,
                 )
                 self._rebuild_xref_table(stream)
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 3bfeee7e2..a346ca76d 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -1515,4 +1515,4 @@ def test_truncated_xref(caplog):
     url = "https://github.com/py-pdf/pypdf/files/14843553/002-trivial-libre-office-writer-broken.pdf"
     name = "iss2575.pdf"
     PdfReader(BytesIO(get_data_from_url(url, name=name)))
-    assert "Invalid/Truncated xref table. Rebuild xref table" in caplog.text
+    assert "Invalid/Truncated xref table. Rebuilding it." in caplog.text