From 80b014648ab9be9b8011ab31885e9c5d5cf856ca Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 18 Apr 2022 10:03:55 +0200 Subject: [PATCH] MAINT: Split long functions I hope this makes them easier to understand / extend --- PyPDF2/filters.py | 72 +++++++++--------- PyPDF2/generic.py | 11 ++- PyPDF2/merger.py | 113 ++++++++++++++-------------- PyPDF2/pdf.py | 170 +++++++++++++++++++++++-------------------- PyPDF2/utils.py | 2 +- Tests/test_merger.py | 17 +++++ 6 files changed, 213 insertions(+), 172 deletions(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index c0667013b..165dd68ce 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -145,45 +145,49 @@ def decode(data, decodeParms): columns = decodeParms[LZW.COLUMNS] # PNG prediction: if predictor >= 10 and predictor <= 15: - output = StringIO() - # PNG prediction can vary from row to row - rowlength = columns + 1 - assert len(data) % rowlength == 0 - prev_rowdata = (0,) * rowlength - for row in range(len(data) // rowlength): - rowdata = [ord_(x) for x in data[(row*rowlength):((row+1)*rowlength)]] - filterByte = rowdata[0] - if filterByte == 0: - pass - elif filterByte == 1: - for i in range(2, rowlength): - rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 - elif filterByte == 2: - for i in range(1, rowlength): - rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 - elif filterByte == 3: - for i in range(1, rowlength): - left = rowdata[i-1] if i > 1 else 0 - floor = math.floor(left + prev_rowdata[i])/2 - rowdata[i] = (rowdata[i] + int(floor)) % 256 - elif filterByte == 4: - for i in range(1, rowlength): - left = rowdata[i - 1] if i > 1 else 0 - up = prev_rowdata[i] - up_left = prev_rowdata[i - 1] if i > 1 else 0 - paeth = paethPredictor(left, up, up_left) - rowdata[i] = (rowdata[i] + paeth) % 256 - else: - # unsupported PNG filter - raise PdfReadError("Unsupported PNG filter %r" % filterByte) - prev_rowdata = rowdata - output.write(''.join([chr(x) for x in rowdata[1:]])) - data = output.getvalue() + data = FlateDecode._decode_png_prediction(data, columns) else: # unsupported predictor raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) return data + @staticmethod + def _decode_png_prediction(data, columns): + output = StringIO() + # PNG prediction can vary from row to row + rowlength = columns + 1 + assert len(data) % rowlength == 0 + prev_rowdata = (0,) * rowlength + for row in range(len(data) // rowlength): + rowdata = [ord_(x) for x in data[(row*rowlength):((row+1)*rowlength)]] + filterByte = rowdata[0] + if filterByte == 0: + pass + elif filterByte == 1: + for i in range(2, rowlength): + rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 + elif filterByte == 2: + for i in range(1, rowlength): + rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 + elif filterByte == 3: + for i in range(1, rowlength): + left = rowdata[i-1] if i > 1 else 0 + floor = math.floor(left + prev_rowdata[i])/2 + rowdata[i] = (rowdata[i] + int(floor)) % 256 + elif filterByte == 4: + for i in range(1, rowlength): + left = rowdata[i - 1] if i > 1 else 0 + up = prev_rowdata[i] + up_left = prev_rowdata[i - 1] if i > 1 else 0 + paeth = paethPredictor(left, up, up_left) + rowdata[i] = (rowdata[i] + paeth) % 256 + else: + # unsupported PNG filter + raise PdfReadError("Unsupported PNG filter %r" % filterByte) + prev_rowdata = rowdata + output.write(''.join([chr(x) for x in rowdata[1:]])) + return output.getvalue() + @staticmethod def encode(data): return compress(data) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 5f0b80ac9..3bb8a98e0 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -42,6 +42,7 @@ from PyPDF2.errors import ( STREAM_TRUNCATED_PREMATURELY, PdfReadError, + PdfReadWarning, PdfStreamError, ) @@ -595,11 +596,13 @@ def readFromStream(stream, pdf): data[key] = value elif pdf.strict: # multiple definitions of key not permitted - raise PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \ - % (utils.hexStr(stream.tell()), key)) + raise PdfReadError( + "Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key)) else: - warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \ - % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning) + warnings.warn( + "Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key), PdfReadWarning) pos = stream.tell() s = readNonWhitespace(stream) diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index 1f2ab2d6b..4a2271d68 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -367,7 +367,6 @@ def _write_dests(self): self.output.addNamedDestinationObject(v) def _write_bookmarks(self, bookmarks=None, parent=None): - if bookmarks is None: bookmarks = self.bookmarks @@ -382,64 +381,68 @@ def _write_bookmarks(self, bookmarks=None, parent=None): if '/Page' in b: for i, p in enumerate(self.pages): if p.id == b['/Page']: - # b[NameObject('/Page')] = p.out_pagedata - args = [NumberObject(p.id), NameObject(b['/Type'])] - # nothing more to add - # if b['/Type'] == '/Fit' or b['/Type'] == '/FitB' - if b['/Type'] == '/FitH' or b['/Type'] == '/FitBH': - if '/Top' in b and not isinstance(b['/Top'], NullObject): - args.append(FloatObject(b['/Top'])) - else: - args.append(FloatObject(0)) - del b['/Top'] - elif b['/Type'] == '/FitV' or b['/Type'] == '/FitBV': - if '/Left' in b and not isinstance(b['/Left'], NullObject): - args.append(FloatObject(b['/Left'])) - else: - args.append(FloatObject(0)) - del b['/Left'] - elif b['/Type'] == '/XYZ': - if '/Left' in b and not isinstance(b['/Left'], NullObject): - args.append(FloatObject(b['/Left'])) - else: - args.append(FloatObject(0)) - if '/Top' in b and not isinstance(b['/Top'], NullObject): - args.append(FloatObject(b['/Top'])) - else: - args.append(FloatObject(0)) - if '/Zoom' in b and not isinstance(b['/Zoom'], NullObject): - args.append(FloatObject(b['/Zoom'])) - else: - args.append(FloatObject(0)) - del b['/Top'], b['/Zoom'], b['/Left'] - elif b['/Type'] == '/FitR': - if '/Left' in b and not isinstance(b['/Left'], NullObject): - args.append(FloatObject(b['/Left'])) - else: - args.append(FloatObject(0)) - if '/Bottom' in b and not isinstance(b['/Bottom'], NullObject): - args.append(FloatObject(b['/Bottom'])) - else: - args.append(FloatObject(0)) - if '/Right' in b and not isinstance(b['/Right'], NullObject): - args.append(FloatObject(b['/Right'])) - else: - args.append(FloatObject(0)) - if '/Top' in b and not isinstance(b['/Top'], NullObject): - args.append(FloatObject(b['/Top'])) - else: - args.append(FloatObject(0)) - del b['/Left'], b['/Right'], b['/Bottom'], b['/Top'] - - b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)}) - - pageno = i - pdf = p.src # noqa: F841 + pageno, pdf = self._write_bookmark_on_page(b, p, i) break if pageno is not None: del b['/Page'], b['/Type'] last_added = self.output.addBookmarkDict(b, parent) + def _write_bookmark_on_page(self, b, p, i): + # b[NameObject('/Page')] = p.out_pagedata + args = [NumberObject(p.id), NameObject(b['/Type'])] + # nothing more to add + # if b['/Type'] == '/Fit' or b['/Type'] == '/FitB' + if b['/Type'] == '/FitH' or b['/Type'] == '/FitBH': + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + del b['/Top'] + elif b['/Type'] == '/FitV' or b['/Type'] == '/FitBV': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + del b['/Left'] + elif b['/Type'] == '/XYZ': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + if '/Zoom' in b and not isinstance(b['/Zoom'], NullObject): + args.append(FloatObject(b['/Zoom'])) + else: + args.append(FloatObject(0)) + del b['/Top'], b['/Zoom'], b['/Left'] + elif b['/Type'] == '/FitR': + if '/Left' in b and not isinstance(b['/Left'], NullObject): + args.append(FloatObject(b['/Left'])) + else: + args.append(FloatObject(0)) + if '/Bottom' in b and not isinstance(b['/Bottom'], NullObject): + args.append(FloatObject(b['/Bottom'])) + else: + args.append(FloatObject(0)) + if '/Right' in b and not isinstance(b['/Right'], NullObject): + args.append(FloatObject(b['/Right'])) + else: + args.append(FloatObject(0)) + if '/Top' in b and not isinstance(b['/Top'], NullObject): + args.append(FloatObject(b['/Top'])) + else: + args.append(FloatObject(0)) + del b['/Left'], b['/Right'], b['/Bottom'], b['/Top'] + + b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)}) + + pageno = i + pdf = p.src # noqa: F841 + return (pageno, pdf) + def _associate_dests_to_pages(self, pages): for nd in self.named_dests: pageno = None @@ -570,6 +573,6 @@ def add(self, title, pagenum): self.tree.addChild(bookmark) def removeAll(self): - for child in [x for x in self.tree.children()]: + for child in self.tree.children(): self.tree.removeChild(child) self.pop() diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 18120d328..47430221b 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -482,7 +482,6 @@ def write(self, stream): if hasattr(stream, 'mode') and 'b' not in stream.mode: warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name) debug = False - import struct if not self._root: self._root = self._addObject(self._root_object) @@ -512,7 +511,12 @@ def write(self, stream): self._sweepIndirectReferences(externalReferenceMap, self._root) del self.stack - # Begin writing: + object_positions = self._write_header(stream) + xref_location = self._write_xref_table(stream, object_positions) + self._write_trailer(stream) + stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) # eof + + def _write_header(self, stream): object_positions = [] stream.write(self._header + b_("\n")) stream.write(b_("%\xE2\xE3\xCF\xD3\n")) @@ -533,16 +537,18 @@ def write(self, stream): key = md5_hash[:min(16, len(self._encrypt_key) + 5)] obj.writeToStream(stream, key) stream.write(b_("\nendobj\n")) + return object_positions - # xref table + def _write_xref_table(self, stream, object_positions): xref_location = stream.tell() stream.write(b_("xref\n")) stream.write(b_("0 %s\n" % (len(self._objects) + 1))) stream.write(b_("%010d %05d f \n" % (0, 65535))) for offset in object_positions: stream.write(b_("%010d %05d n \n" % (offset, 0))) + return xref_location - # trailer + def _write_trailer(self, stream): stream.write(b_("trailer\n")) trailer = DictionaryObject() trailer.update({ @@ -556,9 +562,6 @@ def write(self, stream): trailer[NameObject(TK.ENCRYPT)] = self._encrypt trailer.writeToStream(stream, None) - # eof - stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) - def addMetadata(self, infos): """ Add custom metadata to the output. @@ -1399,10 +1402,10 @@ def getFormTextFields(self): ''' # Retrieve document form fields formfields = self.getFields() - return dict( - (formfields[field]['/T'], formfields[field].get('/V')) for field in formfields \ + return { + formfields[field]['/T']: formfields[field].get('/V') for field in formfields \ if formfields[field].get('/FT') == '/Tx' - ) + } def getNamedDestinations(self, tree=None, retval=None): """ @@ -1620,7 +1623,7 @@ def _flatten(self, pages=None, inherit=None, indirectRef=None): NameObject(PG.CROPBOX), NameObject(PG.ROTATE) ) if inherit is None: - inherit = dict() + inherit = {} if pages is None: # Fix issue 327: set flattenedPages attribute only for # decrypted file @@ -1707,8 +1710,7 @@ def _getObjectFromStream(self, indirectReference): def getObject(self, indirectReference): debug = False if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) - retval = self.cacheGetIndirectObject(indirectReference.generation, - indirectReference.idnum) + retval = self.cacheGetIndirectObject(indirectReference.generation, indirectReference.idnum) if retval is not None: return retval if indirectReference.generation == 0 and \ @@ -1723,13 +1725,15 @@ def getObject(self, indirectReference): if idnum != indirectReference.idnum and self.xrefIndex: # Xref table probably had bad indexes due to not being zero-indexed if self.strict: - raise PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ - % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + raise PdfReadError( + "Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) else: pass # xref table is corrected in non-strict mode elif idnum != indirectReference.idnum and self.strict: # some other problem - raise PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \ - % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + raise PdfReadError( + "Expected object ID (%d %d) does not match actual (%d %d)." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) if self.strict: assert generation == indirectReference.generation retval = readObject(self.stream, self) @@ -1740,7 +1744,6 @@ def getObject(self, indirectReference): if not hasattr(self, '_decryption_key'): raise PdfReadError("file has not been decrypted") # otherwise, decrypt here... - import struct pack1 = struct.pack("= last_end - last_end = start + size - for num in range(start, start+size): - # The first entry is the type - xref_type = getEntry(0) - # The rest of the elements depend on the xref_type - if xref_type == 0: - # linked list of free objects - next_free_object = getEntry(1) # noqa: F841 - next_generation = getEntry(2) # noqa: F841 - elif xref_type == 1: - # objects that are in use but are not compressed - byte_offset = getEntry(1) - generation = getEntry(2) - if generation not in self.xref: - self.xref[generation] = {} - if not used_before(num, generation): - self.xref[generation][num] = byte_offset - if debug: print(("XREF Uncompressed: %s %s"%( - num, generation))) - elif xref_type == 2: - # compressed objects - objstr_num = getEntry(1) - obstr_idx = getEntry(2) - generation = 0 # PDF spec table 18, generation is 0 - if not used_before(num, generation): - if debug: print(("XREF Compressed: %s %s %s"%( - num, objstr_num, obstr_idx))) - self.xref_objStm[num] = (objstr_num, obstr_idx) - elif self.strict: - raise PdfReadError("Unknown xref type: %s"% - xref_type) + self._read_xref_subsections(idx_pairs, getEntry, used_before) trailerKeys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID for key in trailerKeys: @@ -2008,11 +1977,14 @@ def used_before(num, generation): # some PDFs have /Prev=0 in the trailer, instead of no /Prev if startxref == 0: if self.strict: - raise PdfReadError("/Prev=0 in the trailer (try" - " opening with strict=False)") + raise PdfReadError( + "/Prev=0 in the trailer (try" + " opening with strict=False)") else: - warnings.warn("/Prev=0 in the trailer - assuming there" - " is no previous xref table") + warnings.warn( + "/Prev=0 in the trailer - assuming there" + " is no previous xref table" + ) break # bad xref character at startxref. Let's see if we can find # the xref table nearby, as we've observed this error with an @@ -2053,8 +2025,40 @@ def used_before(num, generation): # if not, then either it's just plain wrong, or the non-zero-index is actually correct stream.seek(loc, 0) # return to where it was + def _read_xref_subsections(self, idx_pairs, getEntry, used_before): + last_end = 0 + for start, size in self._pairs(idx_pairs): + # The subsections must increase + assert start >= last_end + last_end = start + size + for num in range(start, start+size): + # The first entry is the type + xref_type = getEntry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = getEntry(1) # noqa: F841 + next_generation = getEntry(2) # noqa: F841 + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = getEntry(1) + generation = getEntry(2) + if generation not in self.xref: + self.xref[generation] = {} + if not used_before(num, generation): + self.xref[generation][num] = byte_offset + elif xref_type == 2: + # compressed objects + objstr_num = getEntry(1) + obstr_idx = getEntry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise PdfReadError("Unknown xref type: %s"% xref_type) + def _zeroXref(self, generation): - self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) + self.xref[generation] = {k-self.xrefIndex: v for (k, v) in list(self.xref[generation].items())} def _pairs(self, array): i = 0 @@ -2517,9 +2521,11 @@ def mergeScaledPage(self, page2, scale, expand=False): dimensions of the page to be merged. """ # CTM to scale : [ sx 0 0 sy 0 0 ] - return self.mergeTransformedPage(page2, [scale, 0, - 0, scale, - 0, 0], expand) + return self.mergeTransformedPage( + page2, + [scale, 0, 0, scale, 0, 0], + expand + ) def mergeRotatedPage(self, page2, rotation, expand=False): """ @@ -2550,9 +2556,11 @@ def mergeTranslatedPage(self, page2, tx, ty, expand=False): :param bool expand: Whether the page should be expanded to fit the dimensions of the page to be merged. """ - return self.mergeTransformedPage(page2, [1, 0, - 0, 1, - tx, ty], expand) + return self.mergeTransformedPage( + page2, + [1, 0, 0, 1, tx, ty], + expand + ) def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): """ @@ -2581,9 +2589,11 @@ def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): ctm = utils.matrixMultiply(translation, rotating) ctm = utils.matrixMultiply(ctm, rtranslation) - return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], - ctm[1][0], ctm[1][1], - ctm[2][0], ctm[2][1]], expand) + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand + ) def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): """ @@ -2606,10 +2616,11 @@ def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): [0, 0, 1]] ctm = utils.matrixMultiply(rotating, scaling) - return self.mergeTransformedPage(page2, - [ctm[0][0], ctm[0][1], - ctm[1][0], ctm[1][1], - ctm[2][0], ctm[2][1]], expand) + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand + ) def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): """ @@ -2633,9 +2644,11 @@ def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): [0, 0, 1]] ctm = utils.matrixMultiply(scaling, translation) - return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], - ctm[1][0], ctm[1][1], - ctm[2][0], ctm[2][1]], expand) + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand + ) def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False): """ @@ -2664,9 +2677,11 @@ def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expan ctm = utils.matrixMultiply(rotating, scaling) ctm = utils.matrixMultiply(ctm, translation) - return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], - ctm[1][0], ctm[1][1], - ctm[2][0], ctm[2][1]], expand) + return self.mergeTransformedPage( + page2, + [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], + expand + ) def addTransformation(self, ctm): """ @@ -3054,7 +3069,6 @@ def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr password = b_((str_(password) + str_(_encryption_padding))[:32]) # 2. Initialize the MD5 hash function and pass the result of step 1 as # input to this function. - import struct m = md5(password) # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash # function. diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py index e76bbb05e..6b3bace21 100644 --- a/PyPDF2/utils.py +++ b/PyPDF2/utils.py @@ -171,7 +171,7 @@ def __getitem__(self, index): def RC4_encrypt(key, plaintext): - S = [i for i in range(256)] + S = list(range(256)) j = 0 for i in range(256): j = (j + S[i] + ord_(key[i % len(key)])) % 256 diff --git a/Tests/test_merger.py b/Tests/test_merger.py index 072fbcb14..f31616608 100644 --- a/Tests/test_merger.py +++ b/Tests/test_merger.py @@ -2,6 +2,7 @@ import sys import PyPDF2 +from PyPDF2.generic import Destination TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -47,5 +48,21 @@ def test_merge(): file_merger.write(tmp_path) file_merger.close() + # Check if bookmarks are correct + pdfr = PyPDF2.PdfFileReader(tmp_path) + assert [el.title for el in pdfr.getOutlines() if isinstance(el, Destination)] == [ + "Foo", + "Bar", + "Baz", + "Foo", + "Bar", + "Baz", + "Foo", + "Bar", + "Baz", + "True", + "A bookmark", + ] + # Clean up os.remove(tmp_path)