From 0b1858ed8b2793999a67ed5c1aeef41b72de0527 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 20:07:31 +0200 Subject: [PATCH 1/6] TST: Add tests for utils --- PyPDF2/filters.py | 30 ++++++++++++++++++++++-------- PyPDF2/generic.py | 7 ++++--- PyPDF2/merger.py | 4 ++-- PyPDF2/pdf.py | 11 +++++------ Tests/test_utils.py | 6 ++++++ sample-files | 2 +- 6 files changed, 40 insertions(+), 20 deletions(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index d3901da7d..c70fd2544 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -220,7 +220,7 @@ def decode(data, decodeParms=None): v is such that 0 <= ord(v) <= 127. """ retval = "" - char = "" + hex_pair = "" x = 0 while True: if x >= len(data): @@ -231,12 +231,12 @@ def decode(data, decodeParms=None): elif c.isspace(): x += 1 continue - char += c - if len(char) == 2: - retval += chr(int(char, base=16)) - char = "" + hex_pair += c + if len(hex_pair) == 2: + retval += chr(int(hex_pair, base=16)) + hex_pair = "" x += 1 - assert char == "" + assert hex_pair == "" return retval @@ -244,6 +244,7 @@ class LZWDecode(object): """Taken from: http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ + class decoder(object): def __init__(self, data): self.STOP=257 @@ -281,9 +282,15 @@ def nextCode(self): return value def decode(self): - """ algorithm derived from: + """ + TIFF 6.0 specification explains in sufficient details the steps to + implement the LZW encode() and decode() algorithms. + + algorithm derived from: http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html and the PDFReference + + :rtype: bytes """ cW = self.CLEARDICT baos="" @@ -316,10 +323,17 @@ def decode(self): @staticmethod def decode(data, decodeParms=None): + """ + :param data: ``bytes`` or ``str`` text to decode. + :param decodeParms: a dictionary of parameter values. + :return: decoded data. + :rtype: bytes + """ return LZWDecode.decoder(data).decode() class ASCII85Decode(object): + """Decodes string ASCII85-encoded data into a byte format.""" @staticmethod def decode(data, decodeParms=None): if version_info < ( 3, 0 ): @@ -485,7 +499,7 @@ def decodeStreamData(stream): else: raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") else: - # unsupported filter + # Unsupported filter raise NotImplementedError("unsupported filter %s" % filterType) return data diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index bed824696..a7759663a 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -28,7 +28,7 @@ """ -Implementation of generic PDF objects (dictionary, number, string, and so on) +Implementation of generic PDF objects (dictionary, number, string, and so on). """ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -75,8 +75,9 @@ def readObject(stream, pdf): elif idx == 1: # hexadecimal string OR dictionary peek = stream.read(2) - stream.seek(-2, 1) # reset to start - if peek == b_('<<'): + stream.seek(-2, 1) # reset to start + + if peek == b_("<<"): return DictionaryObject.readFromStream(stream, pdf) else: return readHexStringFromStream(stream) diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index 6e74c52f8..d78b8ac4c 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -57,7 +57,7 @@ def __init__(self, pagedata, src, id): class PdfFileMerger(object): """ - Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs + Initializes a ``PdfFileMerger`` object. ``PdfFileMerger`` merges multiple PDFs into a single PDF. It can concatenate, slice, insert, or any combination of the above. @@ -205,7 +205,6 @@ def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): :param bool import_bookmarks: You may prevent the source document's bookmarks from being imported by specifying this as ``False``. """ - self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) def write(self, fileobj): @@ -363,6 +362,7 @@ def _write_dests(self): pageno = i pdf = p.src # noqa: F841 break + if pageno is not None: self.output.addNamedDestinationObject(v) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index cfe2b6d16..5434ddcec 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -276,8 +276,7 @@ def addAttachment(self, fname, fdata): https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf Section 7.11.3 """ - - # We need 3 entries: + # We need three entries: # * The file's data # * The /Filespec entry # * The file's name, which goes in the Catalog @@ -1092,9 +1091,9 @@ def getPageLayout(self): def setPageLayout(self, layout): """ - Set the page layout + Set the page layout. - :param str layout: The page layout to be used + :param str layout: The page layout to be used. .. list-table:: Valid ``layout`` arguments :widths: 50 200 @@ -1133,7 +1132,7 @@ def getPageMode(self): of valid modes. :return: Page mode currently being used. - :rtype: str, None if not specified + :rtype: str, None if not specified. """ try: return self._root_object['/PageMode'] @@ -1192,7 +1191,7 @@ class PdfFileReader(object): """ def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True): if overwriteWarnings: - # have to dynamically override the default showwarning since there are no + # Have to dynamically override the default showwarning since there are no # public methods that specify the 'file' parameter def _showwarning(message, category, filename, lineno, file=warndest, line=None): if file is None: diff --git a/Tests/test_utils.py b/Tests/test_utils.py index c72f66778..f6a8140f4 100644 --- a/Tests/test_utils.py +++ b/Tests/test_utils.py @@ -100,3 +100,9 @@ def test_ConvertFunctionsToVirtualList(): def test_hexStr(): assert PyPDF2.utils.hexStr(10) == "0xa" + + +def test_b(): + assert PyPDF2.utils.b_("foo") == b"foo" + assert PyPDF2.utils.b_("😀") == "😀".encode("utf-8") + assert PyPDF2.utils.b_("‰") == "‰".encode("utf-8") diff --git a/sample-files b/sample-files index 6e3a1bb2c..99e32fff7 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 6e3a1bb2c72eaa3406f5e596479953950f91152c +Subproject commit 99e32fff7002b5308e84696547010c249e5cfb0a From 0f2f35ed81576e581b14e65d1ae03e528fc64432 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 20:30:05 +0200 Subject: [PATCH 2/6] Add Test --- Tests/test_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Tests/test_utils.py b/Tests/test_utils.py index f6a8140f4..67308b501 100644 --- a/Tests/test_utils.py +++ b/Tests/test_utils.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import io import os @@ -102,7 +103,9 @@ def test_hexStr(): assert PyPDF2.utils.hexStr(10) == "0xa" +@pytest.mark.no_py27 def test_b(): assert PyPDF2.utils.b_("foo") == b"foo" assert PyPDF2.utils.b_("😀") == "😀".encode("utf-8") assert PyPDF2.utils.b_("‰") == "‰".encode("utf-8") + assert PyPDF2.utils.b_("▷") == "▷".encode("utf-8") From abd58e8fc5eab931f64cfc794dda95e1909d2bda Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 21:21:11 +0200 Subject: [PATCH 3/6] Form fields --- Tests/test_reader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 44dd8e9b7..1988b3ecc 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -263,7 +263,7 @@ def test_get_page_of_encrypted_file(): @pytest.mark.parametrize( - "src,expected,expected_method", + "src,expected,expected_get_fields", [ ( "form.pdf", @@ -280,9 +280,14 @@ def test_get_page_of_encrypted_file(): {"foo": "bar"}, {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": "bar"}}, ), + ( + "crazyones.pdf", + {}, + None, + ), ], ) -def test_get_form(src, expected, expected_method): +def test_get_form(src, expected, expected_get_fields): """Check if we can read out form data.""" src = os.path.join(RESOURCE_ROOT, src) reader = PdfFileReader(src) @@ -290,7 +295,7 @@ def test_get_form(src, expected, expected_method): assert fields == expected fields = reader.getFields() - assert fields == expected_method + assert fields == expected_get_fields @pytest.mark.parametrize( From 7ca0cd808471a5cdae51ba4328ea0f64a054723b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 22:12:56 +0200 Subject: [PATCH 4/6] Eliminate error source --- PyPDF2/pdf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 5434ddcec..1fbf20724 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1412,6 +1412,8 @@ def getFormTextFields(self): ''' # Retrieve document form fields formfields = self.getFields() + if formfields is None: + return {} return { formfields[field]['/T']: formfields[field].get('/V') for field in formfields \ if formfields[field].get('/FT') == '/Tx' From ad7e0b8f7cc6d9f569e67030d8b0f6d4e7bbb51b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 22:14:03 +0200 Subject: [PATCH 5/6] Expand writer test --- Tests/test_writer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Tests/test_writer.py b/Tests/test_writer.py index fe3cdc327..e2ef5cdb0 100644 --- a/Tests/test_writer.py +++ b/Tests/test_writer.py @@ -226,10 +226,16 @@ def test_add_named_destination(): from PyPDF2.pdf import NameObject writer.addNamedDestination(NameObject("A named dest"), 2) + writer.addNamedDestination(NameObject("A named dest2"), 2) from PyPDF2.pdf import IndirectObject - assert writer.getNamedDestRoot() == ["A named dest", IndirectObject(7, 0, writer)] + assert writer.getNamedDestRoot() == [ + "A named dest", + IndirectObject(7, 0, writer), + "A named dest2", + IndirectObject(10, 0, writer), + ] # write "output" to PyPDF2-output.pdf tmp_filename = "dont_commit_named_destination.pdf" From f79eba21e24c284589ed7ca5318ffd1e3a112dbc Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Tue, 26 Apr 2022 22:18:55 +0200 Subject: [PATCH 6/6] PageRange tests --- Tests/test_pagerange.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Tests/test_pagerange.py b/Tests/test_pagerange.py index b8d8fa825..801f5b08d 100644 --- a/Tests/test_pagerange.py +++ b/Tests/test_pagerange.py @@ -10,7 +10,8 @@ def test_equality(): @pytest.mark.parametrize( - "page_range,expected", [(slice(0, 5), "0:5"), (slice(0, 5, 2), "0:5:2")] + "page_range,expected", + [(slice(0, 5), "0:5"), (slice(0, 5, 2), "0:5:2"), ("-1", "-1:"), ("0", "0")], ) def test_str(page_range, expected): assert str(PageRange(page_range)) == expected