Skip to content

Commit

Permalink
Merge branch 'main' into paper-sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Apr 24, 2022
2 parents f46ef74 + d1be80d commit 9573fb2
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 48 deletions.
70 changes: 37 additions & 33 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1217,39 +1217,43 @@ def decode_pdfdocencoding(byte_array):
retval += c
return retval

# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
# C.1 Predefined encodings sorted by character name of another PDF reference
# Some indices have '\u0000' although they should have something else:
# 22: should be '\u0017'
_pdfDocEncoding = (
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
u_('\u0000'), u_('\u0001'), u_('\u0002'), u_('\u0003'), u_('\u0004'), u_('\u0005'), u_('\u0006'), u_('\u0007'), # 0 - 7
u_('\u0008'), u_('\u0009'), u_('\u000a'), u_('\u000b'), u_('\u000c'), u_('\u000d'), u_('\u000e'), u_('\u000f'), # 8 - 15
u_('\u0010'), u_('\u0011'), u_('\u0012'), u_('\u0013'), u_('\u0014'), u_('\u0015'), u_('\u0000'), u_('\u0017'), # 16 - 23
u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), # 24 - 31
u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), # 32 - 39
u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), # 40 - 47
u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), # 48 - 55
u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), # 56 - 63
u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), # 64 - 71
u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), # 72 - 79
u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), # 80 - 87
u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), # 88 - 95
u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), # 96 - 103
u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), # 104 - 111
u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), # 112 - 119
u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), # 120 - 127
u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), # 128 - 135
u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), # 136 - 143
u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), # 144 - 151
u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), # 152 - 159
u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), # 160 - 167
u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), # 168 - 175
u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), # 176 - 183
u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), # 184 - 191
u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), # 192 - 199
u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), # 200 - 207
u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), # 208 - 215
u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), # 216 - 223
u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), # 224 - 231
u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), # 232 - 239
u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), # 240 - 247
u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') # 248 - 255
)

assert len(_pdfDocEncoding) == 256
Expand All @@ -1259,5 +1263,5 @@ def decode_pdfdocencoding(byte_array):
char = _pdfDocEncoding[i]
if char == u_("\u0000"):
continue
assert char not in _pdfDocEncoding_rev
assert char not in _pdfDocEncoding_rev, str(char) + " at " + str(i) + " already at " + str(_pdfDocEncoding_rev[char])
_pdfDocEncoding_rev[char] = i
42 changes: 31 additions & 11 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def set_need_appearances_writer(self):
self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)

except Exception as e:
print('set_need_appearances_writer() catch : ', repr(e))
logger.error('set_need_appearances_writer() catch : ', repr(e))

def addPage(self, page):
"""
Expand Down Expand Up @@ -371,7 +371,7 @@ def appendPagesFromReader(self, reader, after_page_append=None):
# Trigger callback, pass writer page as parameter
if callable(after_page_append): after_page_append(writer_page)

def updatePageFormFieldValues(self, page, fields):
def updatePageFormFieldValues(self, page, fields, flags=0):
'''
Update the form field values for a given page from a fields dictionary.
Copy field texts and values from fields to page.
Expand All @@ -381,6 +381,9 @@ def updatePageFormFieldValues(self, page, fields):
and field data will be updated.
:param fields: a Python dictionary of field names (/T) and text
values (/V)
:param flags: An integer (0 to 7). The first bit sets ReadOnly, the
second bit sets Required, the third bit sets NoExport. See
PDF Reference Table 8.70 for details.
'''
# Iterate through pages, update field values
for j in range(0, len(page[PG.ANNOTS])):
Expand All @@ -394,6 +397,8 @@ def updatePageFormFieldValues(self, page, fields):
writer_annot.update({
NameObject("/V"): TextStringObject(fields[field])
})
if flags:
writer_annot.update({NameObject("/Ff"): NumberObject(flags)})
elif writer_parent_annot.get('/T') == field:
writer_parent_annot.update({
NameObject("/V"): TextStringObject(fields[field])
Expand Down Expand Up @@ -424,7 +429,7 @@ def cloneDocumentFromReader(self, reader, after_page_append=None):
self.cloneReaderDocumentRoot(reader)
self.appendPagesFromReader(reader, after_page_append)

def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True, permissions_flag=-1):
"""
Encrypt this PDF file with the PDF Standard encryption handler.
Expand All @@ -436,6 +441,13 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
:param bool use_128bit: flag as to whether to use 128bit
encryption. When false, 40bit encryption will be used. By default,
this flag is on.
:param unsigned int permissions_flag: permissions as described in
TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the
permission is granted. Hence an integer value of -1 will set all
flags.
Bit position 3 is for printing, 4 is for modifying content, 5 and 6
control annotations, 9 for form fields, 10 for extraction of
text and graphics.
"""
import random
import time
Expand All @@ -449,8 +461,7 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
V = 1
rev = 2
keylen = int(40 / 8)
# permit everything:
P = -1
P = permissions_flag
O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
Expand Down Expand Up @@ -623,7 +634,7 @@ def _sweepIndirectReferences(self, externMap, data):
newobj = self._sweepIndirectReferences(externMap, newobj)
self._objects[idnum-1] = newobj
return newobj_ido
except ValueError:
except (ValueError, RecursionError):
# Unable to resolve the Object, returning NullObject instead.
warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format(
data.__class__.__name__, data
Expand Down Expand Up @@ -2071,7 +2082,7 @@ def _pairs(self, array):
def readNextEndLine(self, stream, limit_offset=0):
debug = False
if debug: print(">>readNextEndLine")
line = b_("")
line_parts = []
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0 or stream.tell() == limit_offset:
Expand All @@ -2098,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
break
else:
if debug: print(" x is neither")
line = x + line
if debug: print((" RNEL line:", line))
line_parts.append(x)
if debug: print("leaving RNEL")
return line
line_parts.reverse()
return b"".join(line_parts)

def decrypt(self, password):
"""
Expand Down Expand Up @@ -2766,7 +2777,7 @@ def compressContentStreams(self):
content = ContentStream(content, self.pdf)
self[NameObject("/Contents")] = content.flateEncode()

def extractText(self, Tj_sep="", TJ_sep=" "):
def extractText(self, Tj_sep="", TJ_sep=""):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
Expand Down Expand Up @@ -2808,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "):
if isinstance(i, TextStringObject):
text += TJ_sep
text += i
elif isinstance(i, NumberObject):
# a positive value decreases and the negative value increases
# space
if int(i) < 0:
if len(text) == 0 or text[-1] != " ":
text += " "
else:
if len(text) > 1 and text[-1] == " ":
text = text[:-1]
text += "\n"
return text

Expand Down
19 changes: 18 additions & 1 deletion Resources/crazyones.txt
Original file line number Diff line number Diff line change
@@ -1 +1,18 @@
The Cr azy Ones Octob er 14, 1998 Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers. The round p egs in the square holes. The ones who see things di˙eren tly . Theyre not fond of rules. And they ha v e no resp ect for the status quo. Y ou can quote them, disagree with them, glorify or vilify them. Ab out the only thing y ou cant do is ignore them. Because they c hange things. They in v en t. They imagine. They heal. They explore. They create. They inspire. They push the h uman race forw ard. Ma yb e they ha v e to b e crazy . Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or sit in silence and hear a song thats nev er b een written? Or gaze at a red planet and see a lab oratory on wheels? W e mak e to ols for these kinds of p eople. While some see them as the crazy ones, w e see genius. Because the p eople who are crazy enough to think they can c hange the w orld, are the ones who do.
The Crazy Ones
October 14, 1998
Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers.
The round p egs in the square holes.
The ones who see things di˙erently. Theyre not fond of rules. And
they have no resp ect for the status quo. You can quote them,
disagree with them, glorify or vilify them.
Ab out the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.
Mayb e they have to b e crazy.
How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never b een written? Or gaze at
a red planet and see a lab oratory on wheels?
We make to ols for these kinds of p eople.
While some see them as the crazy ones, we see genius. Because the
p eople who are crazy enough to think they can change the world,
are the ones who do.
24 changes: 24 additions & 0 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import io
import os
import time
from sys import version_info

import pytest

Expand All @@ -10,6 +12,15 @@
from PyPDF2.errors import PdfReadError
from PyPDF2.filters import _xobj_to_image

if version_info < (3, 0):
from cStringIO import StringIO

StreamIO = StringIO
else:
from io import BytesIO

StreamIO = BytesIO

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
Expand Down Expand Up @@ -462,3 +473,16 @@ def test_get_destination_age_number():
for outline in outlines:
if not isinstance(outline, list):
reader.getDestinationPageNumber(outline)


def test_do_not_get_stuck_on_large_files_without_start_xref():
    """
    Guard against a denial-of-service regression: a large file that lacks a
    startxref marker must be rejected quickly instead of making the library
    hang for minutes to hours.
    """
    began = time.time()
    zero_filled = StreamIO(b"\0" * (5 * 1000 * 1000))
    with pytest.raises(PdfReadError):
        PdfFileReader(zero_filled)
    elapsed = time.time() - began
    # Parsing is expected to take less than a second on a modern CPU; the
    # generous limit keeps the check stable on busy or slow systems.
    assert elapsed < 60
5 changes: 4 additions & 1 deletion Tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ def test_PdfReaderFileLoad():
with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
pdftext = pdftext_file.read()

text = page.extractText().replace("\n", "").encode("utf-8")
text = page.extractText(Tj_sep="", TJ_sep="").encode("utf-8")

# Compare the text of the PDF to a known source
for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")):
assert expected_line == actual_line

assert text == pdftext, (
"PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
% (pdftext, text)
Expand Down
7 changes: 5 additions & 2 deletions Tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def test_remove_images(input_path, ignoreByteStringObject):
with open(tmp_filename, "rb") as input_stream:
reader = PdfFileReader(input_stream)
if input_path == "side-by-side-subfig.pdf":
assert "Lorem ipsum dolor sit amet" in reader.getPage(0).extractText()
extracted_text = reader.getPage(0).extractText()
assert "Lorem ipsum dolor sit amet" in extracted_text

# Cleanup
os.remove(tmp_filename)
Expand Down Expand Up @@ -166,7 +167,9 @@ def test_fill_form():

writer.addPage(page)

writer.updatePageFormFieldValues(writer.getPage(0), {"foo": "some filled in text"})
writer.updatePageFormFieldValues(
writer.getPage(0), {"foo": "some filled in text"}, flags=1
)

# write "output" to PyPDF2-output.pdf
tmp_filename = "dont_commit_filled_pdf.pdf"
Expand Down

0 comments on commit 9573fb2

Please sign in to comment.