diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 094aa1430..903ef7f97 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -1217,39 +1217,43 @@ def decode_pdfdocencoding(byte_array): retval += c return retval +# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7 +# C.1 Predefined encodings sorted by character name of another PDF reference +# Some indices have '\u0000' although they should have something else: +# 22: should be '\u0017' _pdfDocEncoding = ( - u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), - u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), - u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), - u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), - u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), - u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), - u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), - u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), - u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), - u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), - u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), - u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), - u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), - u_('\u0068'), u_('\u0069'), 
u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), - u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), - u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), - u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), - u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), - u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), - u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), - u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), - u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), - u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), - u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), - u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), - u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), - u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), - u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), - u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), - u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), - u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), 
u_('\u00f6'), u_('\u00f7'), - u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') + u_('\u0000'), u_('\u0001'), u_('\u0002'), u_('\u0003'), u_('\u0004'), u_('\u0005'), u_('\u0006'), u_('\u0007'), # 0 - 7 + u_('\u0008'), u_('\u0009'), u_('\u000a'), u_('\u000b'), u_('\u000c'), u_('\u000d'), u_('\u000e'), u_('\u000f'), # 8 - 15 + u_('\u0010'), u_('\u0011'), u_('\u0012'), u_('\u0013'), u_('\u0014'), u_('\u0015'), u_('\u0000'), u_('\u0017'), # 16 - 23 + u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), # 24 - 31 + u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), # 32 - 39 + u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), # 40 - 47 + u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), # 48 - 55 + u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), # 56 - 63 + u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), # 64 - 71 + u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), # 72 - 79 + u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), # 80 - 87 + u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), # 88 - 95 + u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), # 96 - 103 + u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), # 104 - 111 + u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), # 112 - 
119 + u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), # 120 - 127 + u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), # 128 - 135 + u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), # 136 - 143 + u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), # 144 - 151 + u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), # 152 - 159 + u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), # 160 - 167 + u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), # 168 - 175 + u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), # 176 - 183 + u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), # 184 - 191 + u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), # 192 - 199 + u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), # 200 - 207 + u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), # 208 - 215 + u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), # 216 - 223 + u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), # 224 - 231 + u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), # 232 - 239 + u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), 
u_('\u00f7'), # 240 - 247 + u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') # 248 - 255 ) assert len(_pdfDocEncoding) == 256 @@ -1259,5 +1263,5 @@ def decode_pdfdocencoding(byte_array): char = _pdfDocEncoding[i] if char == u_("\u0000"): continue - assert char not in _pdfDocEncoding_rev + assert char not in _pdfDocEncoding_rev, str(char) + " at " + str(i) + " already at " + str(_pdfDocEncoding_rev[char]) _pdfDocEncoding_rev[char] = i diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 53ea01bd5..9695e704c 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -146,7 +146,7 @@ def set_need_appearances_writer(self): self._root_object["/AcroForm"][need_appearances] = BooleanObject(True) except Exception as e: - print('set_need_appearances_writer() catch : ', repr(e)) + logger.error('set_need_appearances_writer() catch : ', repr(e)) def addPage(self, page): """ @@ -371,7 +371,7 @@ def appendPagesFromReader(self, reader, after_page_append=None): # Trigger callback, pass writer page as parameter if callable(after_page_append): after_page_append(writer_page) - def updatePageFormFieldValues(self, page, fields): + def updatePageFormFieldValues(self, page, fields, flags=0): ''' Update the form field values for a given page from a fields dictionary. Copy field texts and values from fields to page. @@ -381,6 +381,9 @@ def updatePageFormFieldValues(self, page, fields): and field data will be updated. :param fields: a Python dictionary of field names (/T) and text values (/V) + :param flags: An integer (0 to 7). The first bit sets ReadOnly, the + second bit sets Required, the third bit sets NoExport. See + PDF Reference Table 8.70 for details. 
''' # Iterate through pages, update field values for j in range(0, len(page[PG.ANNOTS])): @@ -394,6 +397,8 @@ def updatePageFormFieldValues(self, page, fields): writer_annot.update({ NameObject("/V"): TextStringObject(fields[field]) }) + if flags: + writer_annot.update({NameObject("/Ff"): NumberObject(flags)}) elif writer_parent_annot.get('/T') == field: writer_parent_annot.update({ NameObject("/V"): TextStringObject(fields[field]) @@ -424,7 +429,7 @@ def cloneDocumentFromReader(self, reader, after_page_append=None): self.cloneReaderDocumentRoot(reader) self.appendPagesFromReader(reader, after_page_append) - def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True, permissions_flag=-1): """ Encrypt this PDF file with the PDF Standard encryption handler. @@ -436,6 +441,13 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): :param bool use_128bit: flag as to whether to use 128bit encryption. When false, 40bit encryption will be used. By default, this flag is on. + :param unsigned int permissions_flag: permissions as described in + TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the + permission is granted. Hence an integer value of -1 will set all + flags. + Bit position 3 is for printing, 4 is for modifying content, 5 and 6 + control annotations, 9 for form fields, 10 for extraction of + text and graphics. 
""" import random import time @@ -449,8 +461,7 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): V = 1 rev = 2 keylen = int(40 / 8) - # permit everything: - P = -1 + P = permissions_flag O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest()) ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest()) @@ -623,7 +634,7 @@ def _sweepIndirectReferences(self, externMap, data): newobj = self._sweepIndirectReferences(externMap, newobj) self._objects[idnum-1] = newobj return newobj_ido - except ValueError: + except (ValueError, RecursionError): # Unable to resolve the Object, returning NullObject instead. warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format( data.__class__.__name__, data @@ -2071,7 +2082,7 @@ def _pairs(self, array): def readNextEndLine(self, stream, limit_offset=0): debug = False if debug: print(">>readNextEndLine") - line = b_("") + line_parts = [] while True: # Prevent infinite loops in malformed PDFs if stream.tell() == 0 or stream.tell() == limit_offset: @@ -2098,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0): break else: if debug: print(" x is neither") - line = x + line - if debug: print((" RNEL line:", line)) + line_parts.append(x) if debug: print("leaving RNEL") - return line + line_parts.reverse() + return b"".join(line_parts) def decrypt(self, password): """ @@ -2766,7 +2777,7 @@ def compressContentStreams(self): content = ContentStream(content, self.pdf) self[NameObject("/Contents")] = content.flateEncode() - def extractText(self, Tj_sep="", TJ_sep=" "): + def extractText(self, Tj_sep="", TJ_sep=""): """ Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. 
This works well for some PDF @@ -2808,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "): if isinstance(i, TextStringObject): text += TJ_sep text += i + elif isinstance(i, NumberObject): + # a positive value decreases and the negative value increases + # space + if int(i) < 0: + if len(text) == 0 or text[-1] != " ": + text += " " + else: + if len(text) > 1 and text[-1] == " ": + text = text[:-1] text += "\n" return text diff --git a/Resources/crazyones.txt b/Resources/crazyones.txt index 468a57e90..bf55e4ca2 100644 --- a/Resources/crazyones.txt +++ b/Resources/crazyones.txt @@ -1 +1,18 @@ - The Cr azy Ones Octob er 14, 1998 Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers. The round p egs in the square holes. The ones who see things di˙eren tly . Theyre not fond of rules. And they ha v e no resp ect for the status quo. Y ou can quote them, disagree with them, glorify or vilify them. Ab out the only thing y ou cant do is ignore them. Because they c hange things. They in v en t. They imagine. They heal. They explore. They create. They inspire. They push the h uman race forw ard. Ma yb e they ha v e to b e crazy . Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or sit in silence and hear a song thats nev er b een written? Or gaze at a red planet and see a lab oratory on wheels? W e mak e to ols for these kinds of p eople. While some see them as the crazy ones, w e see genius. Because the p eople who are crazy enough to think they can c hange the w orld, are the ones who do. \ No newline at end of file +The Crazy Ones +October 14, 1998 +Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers. +The round p egs in the square holes. +The ones who see things di˙erently. Theyre not fond of rules. And +they have no resp ect for the status quo. You can quote them, +disagree with them, glorify or vilify them. +Ab out the only thing you cant do is ignore them. Because they change +things. They invent. They imagine. 
They heal. They explore. They +create. They inspire. They push the human race forward. +Mayb e they have to b e crazy. +How else can you stare at an empty canvas and see a work of art? Or +sit in silence and hear a song thats never b een written? Or gaze at +a red planet and see a lab oratory on wheels? +We make to ols for these kinds of p eople. +While some see them as the crazy ones, we see genius. Because the +p eople who are crazy enough to think they can change the world, +are the ones who do. diff --git a/Tests/test_reader.py b/Tests/test_reader.py index d38b32ab9..467645e6e 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -1,5 +1,7 @@ import io import os +import time +from sys import version_info import pytest @@ -10,6 +12,15 @@ from PyPDF2.errors import PdfReadError from PyPDF2.filters import _xobj_to_image +if version_info < (3, 0): + from cStringIO import StringIO + + StreamIO = StringIO +else: + from io import BytesIO + + StreamIO = BytesIO + TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources") @@ -462,3 +473,16 @@ def test_get_destination_age_number(): for outline in outlines: if not isinstance(outline, list): reader.getDestinationPageNumber(outline) + + +def test_do_not_get_stuck_on_large_files_without_start_xref(): + """Tests for the absence of a DoS bug, where a large file without an startxref mark + would cause the library to hang for minutes to hours""" + start_time = time.time() + broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000) + with pytest.raises(PdfReadError): + PdfFileReader(broken_stream) + parse_duration = time.time() - start_time + # parsing is expected take less than a second on a modern cpu, but include a large + # tolerance to account for busy or slow systems + assert parse_duration < 60 diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py index eb313fa8c..f607d1059 100644 --- a/Tests/test_workflows.py +++ 
b/Tests/test_workflows.py @@ -31,9 +31,12 @@ def test_PdfReaderFileLoad(): with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file: pdftext = pdftext_file.read() - text = page.extractText().replace("\n", "").encode("utf-8") + text = page.extractText(Tj_sep="", TJ_sep="").encode("utf-8") # Compare the text of the PDF to a known source + for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")): + assert expected_line == actual_line + assert text == pdftext, ( "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text) diff --git a/Tests/test_writer.py b/Tests/test_writer.py index 22d35a6da..fe3cdc327 100644 --- a/Tests/test_writer.py +++ b/Tests/test_writer.py @@ -96,7 +96,8 @@ def test_remove_images(input_path, ignoreByteStringObject): with open(tmp_filename, "rb") as input_stream: reader = PdfFileReader(input_stream) if input_path == "side-by-side-subfig.pdf": - assert "Lorem ipsum dolor sit amet" in reader.getPage(0).extractText() + extracted_text = reader.getPage(0).extractText() + assert "Lorem ipsum dolor sit amet" in extracted_text # Cleanup os.remove(tmp_filename) @@ -166,7 +167,9 @@ def test_fill_form(): writer.addPage(page) - writer.updatePageFormFieldValues(writer.getPage(0), {"foo": "some filled in text"}) + writer.updatePageFormFieldValues( + writer.getPage(0), {"foo": "some filled in text"}, flags=1 + ) # write "output" to PyPDF2-output.pdf tmp_filename = "dont_commit_filled_pdf.pdf"