Merge branch 'main' into paper-sizes

py-pdf · Apr 24, 2022 · 9573fb2 · 9573fb2
2 parents f46ef74 + d1be80d
commit 9573fb2
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 48 deletions.
diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -1217,39 +1217,43 @@ def decode_pdfdocencoding(byte_array):
         retval += c
     return retval
 
+# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
+# C.1 Predefined encodings sorted by character name of another PDF reference
+# Some indices have '\u0000' although they should have something else:
+# 22: should be '\u0017'
 _pdfDocEncoding = (
-  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
-  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
-  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
-  u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
-  u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
-  u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
-  u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
-  u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
-  u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
-  u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
-  u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
-  u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
-  u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
-  u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
-  u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
-  u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
-  u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
-  u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
-  u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
-  u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
-  u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
-  u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
-  u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
-  u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
-  u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
-  u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
-  u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
-  u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
-  u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
-  u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
-  u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
-  u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
+  u_('\u0000'), u_('\u0001'), u_('\u0002'), u_('\u0003'), u_('\u0004'), u_('\u0005'), u_('\u0006'), u_('\u0007'), #  0 -  7
+  u_('\u0008'), u_('\u0009'), u_('\u000a'), u_('\u000b'), u_('\u000c'), u_('\u000d'), u_('\u000e'), u_('\u000f'), #  8 - 15
+  u_('\u0010'), u_('\u0011'), u_('\u0012'), u_('\u0013'), u_('\u0014'), u_('\u0015'), u_('\u0000'), u_('\u0017'), # 16 - 23
+  u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), # 24 - 31
+  u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), # 32 - 39
+  u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), # 40 - 47
+  u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), # 48 - 55
+  u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), # 56 - 63
+  u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), # 64 - 71
+  u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), # 72 - 79
+  u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), # 80 - 87
+  u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), # 88 - 95
+  u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), # 96 - 103
+  u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), # 104 - 111
+  u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), # 112 - 119
+  u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), # 120 - 127
+  u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), # 128 - 135
+  u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), # 136 - 143
+  u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), # 144 - 151
+  u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), # 152 - 159
+  u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), # 160 - 167
+  u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), # 168 - 175
+  u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), # 176 - 183
+  u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), # 184 - 191
+  u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), # 192 - 199
+  u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), # 200 - 207
+  u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), # 208 - 215
+  u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), # 216 - 223
+  u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), # 224 - 231
+  u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), # 232 - 239
+  u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), # 240 - 247
+  u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')  # 248 - 255
 )
 
 assert len(_pdfDocEncoding) == 256
@@ -1259,5 +1263,5 @@ def decode_pdfdocencoding(byte_array):
     char = _pdfDocEncoding[i]
     if char == u_("\u0000"):
         continue
-    assert char not in _pdfDocEncoding_rev
+    assert char not in _pdfDocEncoding_rev, str(char) + " at " + str(i) + " already at " + str(_pdfDocEncoding_rev[char])
     _pdfDocEncoding_rev[char] = i
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -146,7 +146,7 @@ def set_need_appearances_writer(self):
             self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
 
         except Exception as e:
-            print('set_need_appearances_writer() catch : ', repr(e))
+            logger.error('set_need_appearances_writer() catch : ', repr(e))
 
     def addPage(self, page):
         """
@@ -371,7 +371,7 @@ def appendPagesFromReader(self, reader, after_page_append=None):
             # Trigger callback, pass writer page as parameter
             if callable(after_page_append): after_page_append(writer_page)
 
-    def updatePageFormFieldValues(self, page, fields):
+    def updatePageFormFieldValues(self, page, fields, flags=0):
         '''
         Update the form field values for a given page from a fields dictionary.
         Copy field texts and values from fields to page.
@@ -381,6 +381,9 @@ def updatePageFormFieldValues(self, page, fields):
             and field data will be updated.
         :param fields: a Python dictionary of field names (/T) and text
             values (/V)
+        :param flags: An integer (0 to 7). The first bit sets ReadOnly, the
+            second bit sets Required, the third bit sets NoExport. See
+            PDF Reference Table 8.70 for details.
         '''
         # Iterate through pages, update field values
         for j in range(0, len(page[PG.ANNOTS])):
@@ -394,6 +397,8 @@ def updatePageFormFieldValues(self, page, fields):
                     writer_annot.update({
                         NameObject("/V"): TextStringObject(fields[field])
                     })
+                    if flags:
+                        writer_annot.update({NameObject("/Ff"): NumberObject(flags)})
                 elif writer_parent_annot.get('/T') == field:
                     writer_parent_annot.update({
                         NameObject("/V"): TextStringObject(fields[field])
@@ -424,7 +429,7 @@ def cloneDocumentFromReader(self, reader, after_page_append=None):
         self.cloneReaderDocumentRoot(reader)
         self.appendPagesFromReader(reader, after_page_append)
 
-    def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
+    def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True, permissions_flag=-1):
         """
         Encrypt this PDF file with the PDF Standard encryption handler.
 
@@ -436,6 +441,13 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
         :param bool use_128bit: flag as to whether to use 128bit
             encryption.  When false, 40bit encryption will be used.  By default,
             this flag is on.
+        :param unsigned int permissions_flag: permissions as described in
+            TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the
+            permission is granted. Hence an integer value of -1 will set all
+            flags.
+            Bit position 3 is for printing, 4 is for modifying content, 5 and 6
+            control annotations, 9 for form fields, 10 for extraction of
+            text and graphics.
         """
         import random
         import time
@@ -449,8 +461,7 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
             V = 1
             rev = 2
             keylen = int(40 / 8)
-        # permit everything:
-        P = -1
+        P = permissions_flag
         O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
         ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
         ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
@@ -623,7 +634,7 @@ def _sweepIndirectReferences(self, externMap, data):
                         newobj = self._sweepIndirectReferences(externMap, newobj)
                         self._objects[idnum-1] = newobj
                         return newobj_ido
-                    except ValueError:
+                    except (ValueError, RecursionError):
                         # Unable to resolve the Object, returning NullObject instead.
                         warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format(
                             data.__class__.__name__, data
@@ -2071,7 +2082,7 @@ def _pairs(self, array):
     def readNextEndLine(self, stream, limit_offset=0):
         debug = False
         if debug: print(">>readNextEndLine")
-        line = b_("")
+        line_parts = []
         while True:
             # Prevent infinite loops in malformed PDFs
             if stream.tell() == 0 or stream.tell() == limit_offset:
@@ -2098,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
                 break
             else:
                 if debug: print("  x is neither")
-                line = x + line
-                if debug: print(("  RNEL line:", line))
+                line_parts.append(x)
         if debug: print("leaving RNEL")
-        return line
+        line_parts.reverse()
+        return b"".join(line_parts)
 
     def decrypt(self, password):
         """
@@ -2766,7 +2777,7 @@ def compressContentStreams(self):
                 content = ContentStream(content, self.pdf)
             self[NameObject("/Contents")] = content.flateEncode()
 
-    def extractText(self, Tj_sep="", TJ_sep=" "):
+    def extractText(self, Tj_sep="", TJ_sep=""):
         """
         Locate all text drawing commands, in the order they are provided in the
         content stream, and extract the text.  This works well for some PDF
@@ -2808,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "):
                     if isinstance(i, TextStringObject):
                         text += TJ_sep
                         text += i
+                    elif isinstance(i, NumberObject):
+                        # a positive value decreases and a negative value
+                        # increases the space
+                        if int(i) < 0:
+                            if len(text) == 0 or text[-1] != " ":
+                                text += " "
+                        else:
+                            if len(text) > 1 and text[-1] == " ":
+                                text = text[:-1]
                 text += "\n"
         return text
 

diff --git a/Resources/crazyones.txt b/Resources/crazyones.txt
@@ -1 +1,18 @@
- The Cr azy Ones Octob er 14, 1998 Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers. The round p egs in the square holes. The ones who see things di˙eren tly . Theyre not fond of rules. And they ha v e no resp ect for the status quo. Y ou can quote them, disagree with them, glorify or vilify them. Ab out the only thing y ou cant do is ignore them. Because they c hange things. They in v en t. They imagine. They heal. They explore. They create. They inspire. They push the h uman race forw ard. Ma yb e they ha v e to b e crazy . Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or sit in silence and hear a song thats nev er b een written? Or gaze at a red planet and see a lab oratory on wheels? W e mak e to ols for these kinds of p eople. While some see them as the crazy ones, w e see genius. Because the p eople who are crazy enough to think they can c hange the w orld, are the ones who do.
+The Crazy Ones
+October 14, 1998
+Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers.
+The round p egs in the square holes.
+The ones who see things di˙erently. Theyre not fond of rules. And
+they have no resp ect for the status quo. You can quote them,
+disagree with them, glorify or vilify them.
+Ab out the only thing you cant do is ignore them. Because they change
+things. They invent. They imagine. They heal. They explore. They
+create. They inspire. They push the human race forward.
+Mayb e they have to b e crazy.
+How else can you stare at an empty canvas and see a work of art? Or
+sit in silence and hear a song thats never b een written? Or gaze at
+a red planet and see a lab oratory on wheels?
+We make to ols for these kinds of p eople.
+While some see them as the crazy ones, we see genius. Because the
+p eople who are crazy enough to think they can change the world,
+are the ones who do.
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -1,5 +1,7 @@
 import io
 import os
+import time
+from sys import version_info
 
 import pytest
 
@@ -10,6 +12,15 @@
 from PyPDF2.errors import PdfReadError
 from PyPDF2.filters import _xobj_to_image
 
+if version_info < (3, 0):
+    from cStringIO import StringIO
+
+    StreamIO = StringIO
+else:
+    from io import BytesIO
+
+    StreamIO = BytesIO
+
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
 RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
@@ -462,3 +473,16 @@ def test_get_destination_age_number():
     for outline in outlines:
         if not isinstance(outline, list):
             reader.getDestinationPageNumber(outline)
+
+
+def test_do_not_get_stuck_on_large_files_without_start_xref():
+    """Tests for the absence of a DoS bug, where a large file without a startxref mark
+    would cause the library to hang for minutes to hours"""
+    start_time = time.time()
+    broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000)
+    with pytest.raises(PdfReadError):
+        PdfFileReader(broken_stream)
+    parse_duration = time.time() - start_time
+    # parsing is expected to take less than a second on a modern cpu, but include a large
+    # tolerance to account for busy or slow systems
+    assert parse_duration < 60
diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py
@@ -31,9 +31,12 @@ def test_PdfReaderFileLoad():
         with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
             pdftext = pdftext_file.read()
 
-        text = page.extractText().replace("\n", "").encode("utf-8")
+        text = page.extractText(Tj_sep="", TJ_sep="").encode("utf-8")
 
         # Compare the text of the PDF to a known source
+        for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")):
+            assert expected_line == actual_line
+
         assert text == pdftext, (
             "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
             % (pdftext, text)

diff --git a/Tests/test_writer.py b/Tests/test_writer.py
@@ -96,7 +96,8 @@ def test_remove_images(input_path, ignoreByteStringObject):
     with open(tmp_filename, "rb") as input_stream:
         reader = PdfFileReader(input_stream)
         if input_path == "side-by-side-subfig.pdf":
-            assert "Lorem ipsum dolor sit amet" in reader.getPage(0).extractText()
+            extracted_text = reader.getPage(0).extractText()
+            assert "Lorem ipsum dolor sit amet" in extracted_text
 
     # Cleanup
     os.remove(tmp_filename)
@@ -166,7 +167,9 @@ def test_fill_form():
 
     writer.addPage(page)
 
-    writer.updatePageFormFieldValues(writer.getPage(0), {"foo": "some filled in text"})
+    writer.updatePageFormFieldValues(
+        writer.getPage(0), {"foo": "some filled in text"}, flags=1
+    )
 
     # write "output" to PyPDF2-output.pdf
     tmp_filename = "dont_commit_filled_pdf.pdf"