From 0f7c8fe4face5e5cd3f4620a8cbd0e44b888698e Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 5 Apr 2024 08:18:23 +0200
Subject: [PATCH] ROB: Tolerate "truncated" xref (#2580)

Closes #2575.
---
 pypdf/_reader.py     | 11 +++++++++++
 tests/test_reader.py |  8 ++++++++
 2 files changed, 19 insertions(+)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 54fb33f1b..034a0d091 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -677,6 +677,14 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
             read_non_whitespace(stream)
             stream.seek(-1, 1)
             size = cast(int, read_object(stream, self))
+            if not isinstance(size, int):
+                logger_warning(
+                    "Invalid/Truncated xref table. Rebuilding it.",
+                    __name__,
+                )
+                self._rebuild_xref_table(stream)
+                stream.read()
+                return
             read_non_whitespace(stream)
             stream.seek(-1, 1)
             cnt = 0
@@ -815,6 +823,9 @@ def _read_xref_tables_and_trailers(
 
     def _read_xref(self, stream: StreamType) -> Optional[int]:
         self._read_standard_xref_table(stream)
+        if stream.read(1) == b"":
+            return None
+        stream.seek(-1, 1)
         read_non_whitespace(stream)
         stream.seek(-1, 1)
         new_trailer = cast(Dict[str, Any], read_object(stream, self))
diff --git a/tests/test_reader.py b/tests/test_reader.py
index f8be72dc1..a346ca76d 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -1508,3 +1508,11 @@ def test_corrupted_xref():
     name = "iss2516.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert reader.root_object["/Type"] == "/Catalog"
+
+
+@pytest.mark.enable_socket()
+def test_truncated_xref(caplog):
+    url = "https://github.com/py-pdf/pypdf/files/14843553/002-trivial-libre-office-writer-broken.pdf"
+    name = "iss2575.pdf"
+    PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert "Invalid/Truncated xref table. Rebuilding it." in caplog.text