From 19efa1d9a52c88328b49b6e1ec0dfe3e748e7176 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 20 Apr 2022 22:30:55 +0200 Subject: [PATCH 1/6] Fix #297 : fix corruption in startxref or xref table Proposal for #297 Issue is due to corrupted startxref pointer and/or xref table Fixed proposed if incorrect pointer look for all objs to rebuild an xref table Tests updated --- PyPDF2/pdf.py | 57 +++++++++++++++++++++++++++++++++++++++++++- Tests/test_reader.py | 19 ++++++--------- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 47430221b..f5fcab3a1 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1843,6 +1843,36 @@ def read(self, stream): if line[:9] != b_("startxref"): raise PdfReadError("startxref not found") + #check and eventually correct the startxref only in not strict + rebuildXrefTable = False + try: + stream.seek(startxref - 1,0) #-1 to check character before + line=stream.read(1) + if line not in b_("\r\n \t"): + raise UserWarning("incorrect startxref pointer(1)",line) + line = stream.read(4) + if line != b_("xref"): + #not an xref so check if it is an XREF object + line = b_("") + while line in b_("0123456789 \t"): + line = stream.read(1) + if line == b_(""): + raise UserWarning("incorrect startxref pointer(2)") + line += stream.read(2) #1 char already read, +2 to check "obj" + if line.lower() != b_("obj"): + raise UserWarning("incorrect startxref pointer(3)") + while stream.read(1) in b_(" \t\r\n"): + pass; + line=stream.read(256) # check that it is xref obj + if b_("/xref") not in line.lower(): + raise UserWarning("incorrect startxref pointer(4)") + except UserWarning as e: + #import traceback;print(traceback.format_exc()) + warnings.warn(str(e)+", need to rebuild xref table") + if( not self.strict): + rebuildXrefTable = True + else: + raise # read all cross reference tables and their trailers self.xref = {} self.xref_objStm = {} @@ -1927,7 +1957,32 @@ def read(self, stream): if "/Prev" in newTrailer: startxref = newTrailer["/Prev"] else: - break + break + elif rebuildXrefTable: + self.xref={} + p_ = stream.tell() + stream.seek(0,0) + f_ = stream.read(-1) + import re + for m in re.finditer(b_("[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_): + idnum = int(m.group(1)) + generation = int(m.group(2)) + if generation not in self.xref: + self.xref[generation] = {} + self.xref[generation][idnum] = m.start(1) + trailerPos = f_.rfind(b"trailer") - len(f_) + 7 + stream.seek(trailerPos,2) + #code below duplicated + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in list(newTrailer.items()): + if key not in self.trailer: + self.trailer[key] = value + if "/Prev" in newTrailer: + startxref = newTrailer["/Prev"] + else: + break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream stream.seek(-1, 1) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 0d1d6e906..720a5483f 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -200,7 +200,7 @@ def test_get_images_raw(strict, with_prev_0, should_fail): pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), b"/Prev 0 " if with_prev_0 else b"", - pdf_data.find(b"xref"), + pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning ) pdf_stream = io.BytesIO(pdf_data) if should_fail: @@ -214,16 +214,11 @@ def test_get_images_raw(strict, with_prev_0, should_fail): PdfFileReader(pdf_stream, strict=strict) -@pytest.mark.xfail( - reason=( - "It's still broken - and unclear what the issue is. " - "Help would be appreciated!" - ) -) def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - reader = PdfFileReader(path, "rb") - reader.getPage(0) + with pytest.raises(UserWarning): + reader = PdfFileReader(path, "rb") + reader.getPage(0) def test_get_page_of_encrypted_file(): @@ -353,7 +348,7 @@ def test_read_prev_0_trailer(): pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), b"/Prev 0 " if with_prev_0 else b"", - pdf_data.find(b"xref"), + pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: @@ -388,7 +383,7 @@ def test_read_missing_startxref(): pdf_data.find(b"3 0 obj"), pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), - # pdf_data.find(b"xref"), + # pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: @@ -424,7 +419,7 @@ def test_read_unknown_zero_pages(): pdf_data.find(b"3 0 obj"), pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), - pdf_data.find(b"xref"), + pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: From 706d4c56339b3f785219ccfa67efb47083a47322 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 20 Apr 2022 23:35:15 +0200 Subject: [PATCH 2/6] cleanup flake8 report --- PyPDF2/pdf.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index f5fcab3a1..0e9e406e6 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1847,7 +1847,7 @@ def read(self, stream): rebuildXrefTable = False try: stream.seek(startxref - 1,0) #-1 to check character before - line=stream.read(1) + line=stream.read(1) if line not in b_("\r\n \t"): raise UserWarning("incorrect startxref pointer(1)",line) line = stream.read(4) @@ -1862,7 +1862,7 @@ def read(self, stream): if line.lower() != b_("obj"): raise UserWarning("incorrect startxref pointer(3)") while stream.read(1) in b_(" \t\r\n"): - pass; + pass; line=stream.read(256) # check that it is xref obj if b_("/xref") not in line.lower(): raise UserWarning("incorrect startxref pointer(4)") @@ -1872,7 +1872,7 @@ def read(self, stream): if( not self.strict): rebuildXrefTable = True else: - raise + raise # read all cross reference tables and their trailers self.xref = {} self.xref_objStm = {} @@ -1957,20 +1957,19 @@ def read(self, stream): if "/Prev" in newTrailer: startxref = newTrailer["/Prev"] else: - break + break elif rebuildXrefTable: self.xref={} - p_ = stream.tell() stream.seek(0,0) f_ = stream.read(-1) import re - for m in re.finditer(b_("[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_): + for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"),f_): idnum = int(m.group(1)) generation = int(m.group(2)) if generation not in self.xref: self.xref[generation] = {} self.xref[generation][idnum] = m.start(1) - trailerPos = f_.rfind(b"trailer") - len(f_) + 7 + trailerPos = f_.rfind(b"trailer") - len(f_) + 7 stream.seek(trailerPos,2) #code below duplicated readNonWhitespace(stream) @@ -1982,7 +1981,7 @@ def read(self, stream): if "/Prev" in newTrailer: startxref = newTrailer["/Prev"] else: - break + break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream stream.seek(-1, 1) From e61e4696dbb71dc41d44ed1f94866e96b2ab502e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 23 Apr 2022 13:23:46 +0200 Subject: [PATCH 3/6] review Test cf comments in https://github.com/py-pdf/PyPDF2/pull/788#discussion_r856328104 Test in strict=False and True; use PdfReadWarning instead of UserWarniing to be consistant with other raising --- PyPDF2/pdf.py | 13 ++++++------- Tests/test_reader.py | 10 +++++++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 0e9e406e6..149e47ad3 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1849,7 +1849,7 @@ def read(self, stream): stream.seek(startxref - 1,0) #-1 to check character before line=stream.read(1) if line not in b_("\r\n \t"): - raise UserWarning("incorrect startxref pointer(1)",line) + raise PdfReadWarning("incorrect startxref pointer(1)",line) line = stream.read(4) if line != b_("xref"): #not an xref so check if it is an XREF object @@ -1857,18 +1857,17 @@ def read(self, stream): while line in b_("0123456789 \t"): line = stream.read(1) if line == b_(""): - raise UserWarning("incorrect startxref pointer(2)") + raise PdfReadWarning("incorrect startxref pointer(2)") line += stream.read(2) #1 char already read, +2 to check "obj" if line.lower() != b_("obj"): - raise UserWarning("incorrect startxref pointer(3)") + raise PdfReadWarning("incorrect startxref pointer(3)") while stream.read(1) in b_(" \t\r\n"): pass; line=stream.read(256) # check that it is xref obj if b_("/xref") not in line.lower(): - raise UserWarning("incorrect startxref pointer(4)") - except UserWarning as e: - #import traceback;print(traceback.format_exc()) - warnings.warn(str(e)+", need to rebuild xref table") + raise PdfReadWarning("incorrect startxref pointer(4)") + except PdfReadWarning as e: + warnings.warn(str(e)+", need to rebuild xref table (strict=False)",PdfReadWarning) if( not self.strict): rebuildXrefTable = True else: diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 720a5483f..852f4a567 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -7,7 +7,7 @@ from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG from PyPDF2.constants import Ressources as RES -from PyPDF2.errors import PdfReadError +from PyPDF2.errors import PdfReadError,PdfReadWarning from PyPDF2.filters import _xobj_to_image TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -216,9 +216,13 @@ def test_get_images_raw(strict, with_prev_0, should_fail): def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - with pytest.raises(UserWarning): - reader = PdfFileReader(path, "rb") + with pytest.raises(PdfReadWarning) as exc: + print(exc) + reader = PdfFileReader(path,strict=True) reader.getPage(0) + assert ( exc.value.args[0].find("startxref") > 0) + reader = PdfFileReader(path,strict=False) + reader.getPage(0) def test_get_page_of_encrypted_file(): From cd2df385d0fd91c03513f190cb0f79db4f57958d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 24 Apr 2022 10:35:36 +0200 Subject: [PATCH 4/6] reformatting with black applied only on test_reader as recommanded --- Tests/test_reader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 852f4a567..7052013bd 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -7,7 +7,7 @@ from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG from PyPDF2.constants import Ressources as RES -from PyPDF2.errors import PdfReadError,PdfReadWarning +from PyPDF2.errors import PdfReadError, PdfReadWarning from PyPDF2.filters import _xobj_to_image TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -200,7 +200,7 @@ def test_get_images_raw(strict, with_prev_0, should_fail): pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), b"/Prev 0 " if with_prev_0 else b"", - pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning + pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning ) pdf_stream = io.BytesIO(pdf_data) if should_fail: @@ -218,10 +218,10 @@ def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") with pytest.raises(PdfReadWarning) as exc: print(exc) - reader = PdfFileReader(path,strict=True) + reader = PdfFileReader(path, strict=True) reader.getPage(0) - assert ( exc.value.args[0].find("startxref") > 0) - reader = PdfFileReader(path,strict=False) + assert exc.value.args[0].find("startxref") > 0 + reader = PdfFileReader(path, strict=False) reader.getPage(0) From c7ecd8ce7e4a506e07ef8c2f5d59cab42e0c99a1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 24 Apr 2022 16:27:00 +0200 Subject: [PATCH 5/6] extend test Test completed to cover old cases loop detected and fix if rebuilding xref table --- PyPDF2/pdf.py | 8 ++++---- Tests/test_reader.py | 36 ++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 149e47ad3..33fe7133d 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1977,10 +1977,10 @@ def read(self, stream): for key, value in list(newTrailer.items()): if key not in self.trailer: self.trailer[key] = value - if "/Prev" in newTrailer: - startxref = newTrailer["/Prev"] - else: - break + #if "/Prev" in newTrailer: + # startxref = newTrailer["/Prev"] + #else: + break elif x.isdigit(): # PDF 1.5+ Cross-Reference Stream stream.seek(-1, 1) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 7052013bd..927182805 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -164,15 +164,19 @@ def test_get_images(src, nb_images): @pytest.mark.parametrize( - "strict,with_prev_0,should_fail", + "strict,with_prev_0,startx_correction,should_fail", [ - (True, True, True), - (True, False, False), - (False, True, False), - (False, False, False), + (True, False, -1, False), # all nominal => no fail + (True, True, -1, True), # Prev=0 => fail expected + (False, False, -1, False), + (False, True, -1, False), # Prev =0 => no strict so tolerant + (True, False, 0, True), # error on startxref, in strict => fail expected + (True, True, 0, True), + (False, False, 0, False), # error on startxref, but no strict => xref rebuilt,no fail + (False, True, 0, False), ], ) -def test_get_images_raw(strict, with_prev_0, should_fail): +def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): pdf_data = ( b"%%PDF-1.7\n" b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" @@ -200,16 +204,24 @@ def test_get_images_raw(strict, with_prev_0, should_fail): pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), b"/Prev 0 " if with_prev_0 else b"", - pdf_data.find(b"xref") - 1, # -1 due to double % at the beginning + # startx_correction should be -1 due to double % at the beginning indiducing an error on startxref computation + pdf_data.find(b"xref") + startx_correction, ) + print((strict, with_prev_0, should_fail)) + print(pdf_data) pdf_stream = io.BytesIO(pdf_data) if should_fail: - with pytest.raises(PdfReadError) as exc: + with pytest.raises(Exception) as exc: PdfFileReader(pdf_stream, strict=strict) - assert ( - exc.value.args[0] - == "/Prev=0 in the trailer (try opening with strict=False)" - ) + print(exc.type) + if startx_correction != -1: + assert exc.type == PdfReadWarning + else: + assert ( + exc.type == PdfReadError + and exc.value.args[0] + == "/Prev=0 in the trailer (try opening with strict=False)" + ) else: PdfFileReader(pdf_stream, strict=strict) From 28522480349212d48c6d6ddade56280b68bfa53b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 24 Apr 2022 17:04:25 +0200 Subject: [PATCH 6/6] cleanup https://github.com/py-pdf/PyPDF2/pull/788#discussion_r857136971 --- Tests/test_reader.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 927182805..ff7ae6aa0 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -207,13 +207,10 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): # startx_correction should be -1 due to double % at the beginning indiducing an error on startxref computation pdf_data.find(b"xref") + startx_correction, ) - print((strict, with_prev_0, should_fail)) - print(pdf_data) pdf_stream = io.BytesIO(pdf_data) if should_fail: with pytest.raises(Exception) as exc: PdfFileReader(pdf_stream, strict=strict) - print(exc.type) if startx_correction != -1: assert exc.type == PdfReadWarning else: @@ -229,10 +226,9 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") with pytest.raises(PdfReadWarning) as exc: - print(exc) reader = PdfFileReader(path, strict=True) reader.getPage(0) - assert exc.value.args[0].find("startxref") > 0 + assert "startxref" in exc.value.args[0] reader = PdfFileReader(path, strict=False) reader.getPage(0)